Sentiment analysis - movie review classification

**Import data**

In [3]:
import pandas as pd 

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [6]:
# Raw GitHub location of the IMDB_sample.csv file (split across two literals for readability).
url = (
    "https://raw.githubusercontent.com/PyRPy/ML_Py_Templates/master/"
    "DataCamp_Templates_Py/Data/IMDB_sample.csv"
)

In [7]:
# Read the CSV at `url` into a DataFrame, then preview its first five rows.
reviews = pd.read_csv(filepath_or_buffer=url)
reviews.head(n=5)

Unnamed: 0.1,Unnamed: 0,review,label
0,18245,This short spoof can be found on Elite's Mille...,0
1,19801,A singularly unfunny musical comedy that artif...,0
2,3222,"An excellent series, masterfully acted and dir...",1
3,6010,The master of movie spectacle Cecil B. De Mill...,1
4,16355,I was gifted with this movie as it had such a ...,0


**Convert text/words to vectors**

In [8]:
# Learn a TF-IDF vocabulary from the review text.
#   - stop_words: ENGLISH_STOP_WORDS is a frozenset; scikit-learn >= 1.2 validates
#     this parameter and accepts only 'english', a list, or None, so it is
#     converted to a list (older versions accept a list as well).
#   - ngram_range=(1, 2): single words and two-word phrases.
#   - max_features=200: cap the vocabulary at 200 terms.
#   - token_pattern: a token is a run of two or more word characters that are not digits.
vect = TfidfVectorizer(
    stop_words=list(ENGLISH_STOP_WORDS),
    ngram_range=(1, 2),
    max_features=200,
    token_pattern=r'\b[^\d\W][^\d\W]+\b',
).fit(reviews.review)

In [9]:
# Encode every review with the fitted vectorizer to obtain the feature matrix.
X = vect.transform(reviews["review"])

In [13]:
# Densify the TF-IDF matrix for inspection, labelling each column with its vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
reviews_transformed = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())
reviews_transformed.head()



Unnamed: 0,acting,action,actor,actors,actually,american,audience,away,bad,beautiful,believe,best,better,big,bit,black,book,br,br br,budget,camera,cast,character,characters,classic,come,comedy,comes,completely,course,day,dead,death,did,didn,different,director,does,doesn,don,...,special,star,stars,start,story,sure,takes,thing,things,think,thought,time,times,true,truly,try,trying,tv,use,used,ve,ve seen,version,want,war,wasn,watch,watched,watching,way,wife,woman,women,work,world,worst,worth,year,years,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154443,0.0,0.0,0.0,0.0,0.178214,0.089107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.197021,0.45396,0.0,0.0,0.173056,0.206914,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168496,0.0,0.0,0.179543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.070525,0.0,0.0,0.082068,0.0,0.0,0.100147,0.0,0.0,0.099804,0.0,0.0,0.074134,0.088556,0.0,0.107696,0.0,0.513265,0.256633,0.0,0.0,0.0,0.0,0.0,0.0,0.08996,0.187864,0.096281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.165816,0.0,...,0.102089,0.0,0.0,0.0,0.0,0.0,0.098239,0.0,0.088622,0.0,0.0,0.115332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199219,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.240611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.277405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.330994,0.0,0.0,0.0,0.199116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.335568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.29234
3,0.0,0.217319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.409072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.197616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173699,0.0,0.151979,...,0.0,0.0,0.0,0.0,0.141845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.208256
4,0.0,0.114076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130649,0.518572,0.259286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.100518,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.660883,0.10068,0.0,0.0,0.0,0.0,0.0,0.0


**Prepare the label**

In [9]:
# Target vector: the `label` column of the reviews frame.
y = reviews["label"]

**Split the data into train and test sets**

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=456,
)

**Select and fit the model**

In [13]:
# Logistic regression with default settings, trained on the training split.
# The fit call is left as the final expression so the estimator repr is still displayed.
log_reg = LogisticRegression()
log_reg.fit(X=X_train, y=y_train)

LogisticRegression()

**Prediction and model evaluation**

In [14]:
# Predicted labels for the held-out test rows.
y_pred = log_reg.predict(X=X_test)

In [16]:
# Fraction of test reviews whose predicted label matches the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy on the test data: ', test_accuracy)

Accuracy on the test data:  0.7681545636242505


Confusion matrix

In [19]:
# Rows are true labels, columns are predicted labels.
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

[[588 175]
 [173 565]]
