In [1]:
import pandas as pd
import numpy as np

filepath_dict = {'yelp':'sentiment labelled sentences/yelp_labelled.txt',
               'amazon':'sentiment labelled sentences/amazon_cells_labelled.txt',
               'imdb':'sentiment labelled sentences/imdb_labelled.txt'}
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['text', 'label'], sep='\t')
    df['source'] = source #Add another column filled with the source name
    df_list.append(df)

df = pd.concat(df_list)

yelp = df.loc[df['source']=='yelp']
yelp


Unnamed: 0,text,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
995,I think food should have flavor and texture an...,0,yelp
996,Appetite instantly gone.,0,yelp
997,Overall I was not impressed and would not go b...,0,yelp
998,"The whole experience was underwhelming, and I ...",0,yelp


In [2]:
from sklearn.model_selection import train_test_split

X = yelp['text'].values
y = yelp['label'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1000)
print(X_train.shape, type(X_train), y_train.shape, type(y_train))

(750,) <class 'numpy.ndarray'> (750,) <class 'numpy.ndarray'>


array(['The food was barely lukewarm, so it must have been sitting waiting for the server to bring it out to us.',
       'Sorry, I will not be getting food from here anytime soon :(',
       'Of all the dishes, the salmon was the best, but all were great.',
       'The fries were not hot, and neither was my burger.',
       "In fact I'm going to round up to 4 stars, just because she was so awesome.",
       'Will go back next trip out.',
       'This was my first crawfish experience, and it was delicious!',
       "I could barely stomach the meal, but didn't complain because it was a business lunch.",
       'A great way to finish a great.',
       'Best service and food ever, Maria our server was so good and friendly she made our day.',
       'Good food , good service .',
       'My drink was never empty and he made some really great menu suggestions.',
       'Be sure to order dessert, even if you need to pack it to-go - the tiramisu and cannoli are both to die for.',
       'The f

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english')
sparse_matrix = count_vectorizer.fit_transform(X_train)
doc_term_matrix = sparse_matrix.todense()
voc = count_vectorizer.get_feature_names_out()
print(doc_term_matrix.shape, type(doc_term_matrix))

(750, 1510) <class 'numpy.matrix'>


In [4]:
df = pd.DataFrame(doc_term_matrix, columns=count_vectorizer.get_feature_names_out())

df.loc['MVR'] = (df == 0).sum()/len(df)*100

voc = df.columns[df.loc['MVR'] != df.loc['MVR'].max()].to_list()
df

Unnamed: 0,00,10,100,11,12,15,1979,20,2007,23,...,yama,yeah,year,yellow,yellowtail,yucky,yukon,yum,yummy,zero
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
747,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
748,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
749,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


In [5]:
count_vectorizer = CountVectorizer(stop_words='english', vocabulary=voc)
sparse_matrix = count_vectorizer.fit_transform(X_train)
doc_term_matrix = sparse_matrix.todense()
print(doc_term_matrix.shape, type(doc_term_matrix))

(750, 555) <class 'numpy.matrix'>


In [6]:
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(0.0015)
selector.fit(doc_term_matrix)
keep_idx=list(selector.get_support(doc_term_matrix.any()))
voc = list(map(count_vectorizer.get_feature_names_out().__getitem__, keep_idx))



In [7]:
count_vectorizer = CountVectorizer(stop_words='english', vocabulary=voc)
sparse_matrix = count_vectorizer.fit_transform(X_train)
doc_term_matrix = sparse_matrix.todense()
print(doc_term_matrix.shape, type(doc_term_matrix))

(750, 555) <class 'numpy.matrix'>


In [8]:
# count_vectorizer = CountVectorizer(vocabulary=voc)
# X_train1 = X_train
# X_test1 = X_test
# X_train1, X_test1 = map(lambda x: count_vectorizer.transform(x), (X_train1, X_test1))

In [9]:
# from sklearn.ensemble import RandomForestClassifier
# classifier = RandomForestClassifier()
# classifier.fit(X_train1, y_train)
# df = pd.DataFrame(doc_term_matrix, columns=count_vectorizer.get_feature_names_out())
# df.loc['RFC'] = classifier.feature_importances_
# rfc_threshold = np.percentile(classifier.feature_importances_, 10)

# voc = df.columns[df.loc['RFC'] > rfc_threshold].to_list()
# rfc_threshold

In [10]:
count_vectorizer = CountVectorizer(stop_words='english', vocabulary=voc)
sparse_matrix = count_vectorizer.fit_transform(X_train)
doc_term_matrix = sparse_matrix.todense()
print(doc_term_matrix.shape, type(doc_term_matrix))

(750, 555) <class 'numpy.matrix'>


In [11]:
from sklearn.feature_selection import chi2

df = pd.DataFrame(doc_term_matrix, columns=count_vectorizer.get_feature_names_out())
chi_threshold = np.percentile(chi2(doc_term_matrix, y_train)[1], 90)
df.loc['CHI'] = chi2(doc_term_matrix, y_train)[1]

voc = df.columns[df.loc['CHI'] < chi_threshold].to_list()
df



Unnamed: 0,10,100,12,20,30,40,absolutely,acknowledged,actually,added,...,wonderful,worse,worst,worth,wouldn,wow,wrong,year,yummy,zero
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
747,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
748,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
749,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [12]:
count_vectorizer = CountVectorizer(vocabulary=voc)
X_train, X_test = map(lambda x: count_vectorizer.transform(x), (X_train, X_test))
print(X_train.shape, type(X_train))

(750, 463) <class 'scipy.sparse.csr.csr_matrix'>


In [13]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=200)
classifier.fit(X_train, y_train)
score =  classifier.score(X_test, y_test)
print('Accuracy:', score)

Accuracy: 0.768


In [14]:
from sklearn.linear_model import PassiveAggressiveClassifier
classifier = PassiveAggressiveClassifier()
classifier.fit(X_train, y_train)
score =  classifier.score(X_test, y_test)
print('Accuracy:', score)

Accuracy: 0.716
