In [1]:
from sklearn import datasets
import pandas as pd
import os
import numpy as np

In [2]:
path=r"C:\Users\212448576\Downloads\Reuters_data"

In [3]:
iris=datasets.load_iris()

In [4]:
files=os.listdir(path)

In [5]:
files=[x for x in files if '.txt' in x]
files[:10]

['training_crude_10011.txt',
 'training_crude_10078.txt',
 'training_crude_10080.txt',
 'training_crude_10106.txt',
 'training_crude_10168.txt',
 'training_crude_10190.txt',
 'training_crude_10192.txt',
 'training_crude_10200.txt',
 'training_crude_10228.txt',
 'training_crude_1026.txt']

In [6]:
target=[]
article_text=[]

for file in files:
    if '.txt' not in file:continue
    f=open(path+'\\'+file,encoding='latin-1')
    article_text.append(" ".join([line.strip() for line in f if line.strip()!=""]))
    
    if "crude" in file:
        target.append("crude")
    else:
        target.append("money")
    f.close()

In [7]:
mydata=pd.DataFrame({'target':target,'article_text':article_text})

In [8]:
mydata.head()

Unnamed: 0,target,article_text
0,crude,CANADA OIL EXPORTS RISE 20 PCT IN 1986 Canadia...
1,crude,BP &lt;BP> DOES NOT PLAN TO HIKE STANDARD &lt;...
2,crude,BP&lt;BP> OFFER RAISES EXPECTATIONS FOR OIL VA...
3,crude,USX &lt;X> SAYS TALKS ENDED WITH BRITISH PETRO...
4,crude,BP &lt;BP> MAY HAVE TO RAISE BID - ANALYSTS B...


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
article_train,article_test=train_test_split(mydata,test_size=0.2,random_state=2)

In [11]:
article_train.head()

Unnamed: 0,target,article_text
280,crude,"MALAYSIA TO CUT OIL OUTPUT FURTHER, TRADERS SA..."
688,money,BANKERS OPPOSE STRICT TAIWAN CURRENCY CONTROLS...
375,crude,"OPEC WITHIN OUTPUT CEILING, SUBROTO SAYS Opec ..."
665,money,U.K. MONEY MARKET SHORTAGE FORECAST AT 300 MLN...
589,money,"CURRENCY FUTURES TO KEY OFF G-5, G-7 MEETINGS ..."


In [12]:
np.argmax(article_text)

410

In [13]:
article_train.reset_index(inplace=True,drop=True)
article_test.reset_index(inplace=True,drop=True)

In [14]:
y_train=(article_train['target']=='money').astype(int)
y_test=(article_test['target']=='money').astype(int)

In [15]:
# import nltk
# nltk.download()

In [16]:
# from nltk.stem.wordnet import WordNetLemmatizer
# from nltk.corpus import stopwords
# from string import punctuation
# from nltk.tokenize import word_tokenize
# lemma=WordNetLemmatizer()
# my_stop=set(stopwords.words('english')+list(punctuation))

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize # used to break sentences into words
lemma = WordNetLemmatizer()
my_stop=set(stopwords.words('english')+list(punctuation)) 


In [17]:
def split_into_lemmas(message):
    message=message.lower()
    words=word_tokenize(message)
    word_sans_stop=[]
    for word in words:
        if word in my_stop:continue
        word_sans_stop.append(word)
    return [lemma.lemmatize(word) for word in word_sans_stop]

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [219]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
tf=CountVectorizer(analyzer=split_into_lemmas,min_df=20,max_df=500,stop_words=my_stop)

In [220]:
tfidf=TfidfVectorizer(analyzer=split_into_lemmas,min_df=20,max_df=500,stop_words=my_stop)

In [20]:
tf.fit(article_train['article_text'])

CountVectorizer(analyzer=<function split_into_lemmas at 0x102C5A08>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=500,
        max_features=None, min_df=20, ngram_range=(1, 1),
        preprocessor=None,
        stop_words={'aren', 'she', 'is', 'doing', "isn't", 'out', "she's", "shan't", 'yourselves', "mightn't", 'were', 'other', "needn't", 'my', '`', 'his', 'these', 'yourself', 'them', 'once', 'under', 'or', 'themselves', 'yours', 'until', 'few', 'me', 'there', 'too', "you'd", "it's", 'a', 'more', 'above',... 'some', "'", 'do', 'again', 'hers', 'only', '#', "weren't", '(', 'have', "mustn't", 'ma', 'myself'},
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [221]:
tfidf.fit(article_train['article_text'])

TfidfVectorizer(analyzer=<function split_into_lemmas at 0x102C5A08>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=500,
        max_features=None, min_df=20, ngram_range=(1, 1), norm='l2',
        preprocessor=None, smooth_idf=True,
        stop_words={'aren', 'she', 'is', 'doing', "isn't", 'out', "she's", "shan't", 'yourselves', "mightn't", 'were', 'other', "needn't", 'my', '`', 'his', 'these', 'yourself', 'them', 'once', 'under', 'or', 'themselves', 'yours', 'until', 'few', 'me', 'there', 'too', "you'd", "it's", 'a', 'more', 'above',... 'some', "'", 'do', 'again', 'hers', 'only', '#', "weren't", '(', 'have', "mustn't", 'ma', 'myself'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [21]:
train_tf=tf.transform(article_train['article_text'])

In [222]:
train_tf=tfidf.transform(article_train['article_text'])

In [223]:
train_tf

<741x677 sparse matrix of type '<class 'numpy.float64'>'
	with 36338 stored elements in Compressed Sparse Row format>

In [224]:
print(train_tf.shape)
print(type(train_tf.shape))
print(train_tf.toarray())

(741, 677)
<class 'tuple'>
[[0.         0.16087484 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.077667   0.06273514 0.         ... 0.         0.         0.        ]
 ...
 [0.         0.06791456 0.         ... 0.         0.         0.        ]
 [0.         0.08040204 0.         ... 0.         0.         0.        ]
 [0.02693875 0.13055784 0.         ... 0.         0.06033226 0.        ]]


In [225]:
x_train_tf=pd.DataFrame(train_tf.toarray(),columns=tf.get_feature_names())
x_train_tf.head()

Unnamed: 0,'','s,--,...,1,1.5,10,100,15,15.8,....1,work,working,world,worth,would,year,yen,yesterday,yet,york
0,0.0,0.160875,0.0,0.0,0.090905,0.0,0.12917,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.123951,0.045114,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.069069,0.0,0.0,0.0,0.0,0.0
2,0.077667,0.062735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177247,...,0.0,0.0,0.0,0.0,0.12084,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136024,0.0,0.0,...,0.0,0.0,0.0,0.0,0.188994,0.0,0.0,0.0,0.0,0.0
4,0.139676,0.016117,0.076097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.103485,0.0,0.203471,0.0,0.0,0.0


In [226]:
x_train_tf.columns

Index(['''', ''s', '--', '...', '1', '1.5', '10', '100', '15', '15.8',
       ...
       'work', 'working', 'world', 'worth', 'would', 'year', 'yen',
       'yesterday', 'yet', 'york'],
      dtype='object', length=677)

In [229]:
test_tf=tfidf.transform(article_test['article_text'])
test_tf.toarray()

array([[0.03963967, 0.01600937, 0.        , ..., 0.        , 0.04438868,
        0.        ],
       [0.        , 0.0476639 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.10564493, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.04776287, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.07598587, 0.02045907, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [230]:
x_test_tf=pd.DataFrame(test_tf.toarray(),columns=tf.get_feature_names())
x_test_tf.head(3)

Unnamed: 0,'','s,--,...,1,1.5,10,100,15,15.8,....1,work,working,world,worth,would,year,yen,yesterday,yet,york
0,0.03964,0.016009,0.0,0.0,0.0,0.0,0.032136,0.044389,0.0,0.0,...,0.0,0.0,0.0,0.0,0.041116,0.0,0.0,0.0,0.044389,0.0
1,0.0,0.047664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.105645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.085969,0.0,0.05479,0.0,0.0,0.0,0.0,0.0


In [231]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [29]:
KNeighborsClassifier?

In [92]:
knn=KNeighborsClassifier(n_neighbors=10)

In [93]:
knn.fit(x_train_tf,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [94]:
predictions=knn.predict(x_test_tf)

In [95]:
accuracy_score(y_test,predictions)

0.9623655913978495

In [188]:
SVC?

In [266]:
clf_svm=SVC(C=200)

In [267]:
clf_svm.fit(x_train_tf,y_train)

SVC(C=200, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [268]:
accuracy_score(y_train,clf_svm.predict(x_train_tf))

0.9946018893387314

In [269]:
accuracy_score(y_test,clf_svm.predict(x_test_tf))

0.989247311827957

In [197]:
MultinomialNB?

In [37]:
clf_nb=MultinomialNB()

In [38]:
clf_nb.fit(x_train_tf,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [39]:
accuracy_score(y_test,clf_nb.predict(x_test_tf))

0.989247311827957