In [1]:
import pandas as pd

# Tab-separated input files, keyed by the name recorded in the 'source' column.
filepath_dict = {
    'yelp': 'sentiment labelled sentences/yelp_labelled.txt',
    'amazon': 'sentiment labelled sentences/amazon_cells_labelled.txt',
    'imdb': 'sentiment labelled sentences/imdb_labelled.txt',
}

# Read each file into a ('text', 'label') frame and tag every row with its source key.
df_list = [
    pd.read_csv(path, names=['text', 'label'], sep='\t').assign(source=site)
    for site, path in filepath_dict.items()
]

# Stack the per-source frames; each keeps its own row index.
df = pd.concat(df_list)

# Keep only the rows whose source is 'yelp' and display them.
is_yelp = df['source'] == 'yelp'
yelp = df.loc[is_yelp]
yelp

Unnamed: 0,text,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
995,I think food should have flavor and texture an...,0,yelp
996,Appetite instantly gone.,0,yelp
997,Overall I was not impressed and would not go b...,0,yelp
998,"The whole experience was underwhelming, and I ...",0,yelp


In [2]:
from sklearn.model_selection import train_test_split

# Inputs are the 'text' column, targets are the 'label' column (as NumPy arrays).
X, y = yelp['text'].values, yelp['label'].values

# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
split = train_test_split(X, y, test_size=0.25, random_state=1000)
X_train, X_test, y_train, y_test = split

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training text only, with English stop words excluded,
# then encode the same training documents as per-term counts.
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(X_train)
sparse_matrix = count_vectorizer.transform(X_train)
sparse_matrix

<750x1510 sparse matrix of type '<class 'numpy.int64'>'
	with 3684 stored elements in Compressed Sparse Row format>

In [4]:
# Dense copy of the training count matrix as a plain ndarray
# (.todense() would return the discouraged np.matrix type).
doc_term_matrix = sparse_matrix.toarray()

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer its replacement and fall back only on releases that
# predate it, so this cell runs on both old and current versions.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# NOTE: this rebinds `df` (previously the concatenated reviews) to the
# document-term frame: one row per training document, one column per term.
df = pd.DataFrame(doc_term_matrix, columns=feature_names)

# Append an 'MVR' row holding, for each term, the percentage of rows in which
# its count is zero. Computed before the row is added, so len(df) is the
# number of documents.
df.loc['MVR'] = (df == 0).sum() / len(df) * 100

# Optional: view six randomly chosen term columns, rounded for readability.
df.round(3).sample(6, axis='columns')



Unnamed: 0,00,10,100,11,12,15,1979,20,2007,23,...,yama,yeah,year,yellow,yellowtail,yucky,yukon,yum,yummy,zero
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
747,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
748,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
749,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


In [5]:
# Largest value in the 'MVR' row, i.e. the highest zero-percentage among all term columns.
df.loc['MVR', :].max()

99.86666666666667

In [6]:
# Keep every term whose 'MVR' value differs from the maximum; the terms that
# sit exactly at the maximum are dropped from the vocabulary.
mvr_row = df.loc['MVR']
not_at_max = mvr_row != mvr_row.max()
keep_voc = df.columns[not_at_max].to_list()

In [7]:
# Vocabulary size before vs. after pruning. The fitted `vocabulary_` mapping has
# one entry per feature, so its length equals the feature count and, unlike
# get_feature_names() (removed in scikit-learn 1.2), exists on every version.
print(len(count_vectorizer.vocabulary_), len(keep_voc))

1510 555




In [8]:
# New vectorizer restricted to the pruned vocabulary; a fixed vocabulary needs no fit.
count_vectorizer = CountVectorizer(vocabulary=keep_voc)

# Encode both splits with that vocabulary.
# NOTE: X_train / X_test are rebound here from raw text to count matrices, so
# re-run the train/test split cell before re-running this one.
X_train = count_vectorizer.transform(X_train)
X_test = count_vectorizer.transform(X_test)

In [9]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with default settings on the training counts
# (fit() returns the estimator itself, so the call can be chained).
classifier = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the held-out split.
score = classifier.score(X_test, y_test)
print('Accuracy:', score)

Accuracy: 0.752
