In [1]:
import pandas as pd

In [2]:
# Load the tab-separated movie-review file into a DataFrame.
df = pd.read_csv('Data/moviereviews.tsv', delimiter='\t')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...


In [4]:
# Print column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   6000 non-null   object
 1   review  5980 non-null   object
dtypes: object(2)
memory usage: 93.9+ KB


In [5]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

label      0
review    20
dtype: int64

Checking for Blank Reviews

In [6]:
# Collect the index labels of reviews that consist solely of whitespace.
# Missing (NaN) entries are not `str` instances, so the isinstance guard
# skips them instead of failing on `.isspace()`.
# Iterating over the 'review' column alone (rather than unpacking whole rows)
# keeps this working even if the frame has a different number of columns.
blanks_review = [
    index
    for index, review in df['review'].items()
    if isinstance(review, str) and review.isspace()
]

In [7]:
# Number of whitespace-only reviews found by the scan.
len(blanks_review)

0

In [8]:
# The scan found no whitespace-only reviews, so the only unusable rows are
# those containing NaN values; drop them.
df = df.dropna()

In [9]:
# Re-count missing entries per column to confirm none remain.
df.isna().sum()

label     0
review    0
dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Features are the raw review texts; targets are the pos/neg labels.
X, y = df['review'], df['label']

In [12]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report,confusion_matrix

In [14]:
# Two-step pipeline: TF-IDF features from the raw text, then a linear SVM
# classifier. Step names are kept as 'tfidf' and 'clf'.
my_model = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

In [15]:
# Fit the whole pipeline (vectorizer + classifier) on the training split only.
my_model.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [16]:
# Predict labels for the held-out reviews; used by the evaluation below.
prediction = my_model.predict(X_test)

In [17]:
# Show the confusion matrix, a blank-line gap, then the per-class
# precision/recall/F1 report for the held-out split.
print(
    confusion_matrix(y_test, prediction),
    '\n',
    classification_report(y_test, prediction),
    sep='\n',
)

[[835  72]
 [ 58 829]]


              precision    recall  f1-score   support

         neg       0.94      0.92      0.93       907
         pos       0.92      0.93      0.93       887

    accuracy                           0.93      1794
   macro avg       0.93      0.93      0.93      1794
weighted avg       0.93      0.93      0.93      1794



In [18]:
# Spot check: classify a single hand-written sentence with the fitted pipeline.
# NOTE(review): "exprience" is misspelled in the sample text; it is model input,
# so it is left exactly as written.
my_model.predict(['it was a great exprience to watching this movie in theater'])

array(['pos'], dtype=object)

In [19]:
# Spot check with a very short, rating-style phrase.
my_model.predict(['1 star movie'])

array(['neg'], dtype=object)