In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the movie-review dataset from a CSV file (path is relative to this
# notebook's working directory) into a DataFrame.
movies = pd.read_csv('../../DATA/moviereviews.csv')
# Preview the first rows to confirm the file parsed as expected.
movies.head()

In [None]:
# Discard every row that has a missing value in any column.
movies = movies.dropna(axis=0, how='any')
# Per-column tally of missing values that remain after the drop.
movies.isna().sum()

In [None]:
# Drop reviews that carry no usable text. str.isspace() is False for the
# empty string, so an isspace-only filter would let '' rows through;
# stripping first and comparing against '' removes both empty and
# whitespace-only reviews in one vectorized pass.
movies = movies[movies['review'].str.strip() != '']
# Sanity check: any blank review still present would be displayed here.
movies[movies['review'].str.strip() == '']

In [None]:
# Summarize the cleaned frame: columns, dtypes and non-null counts.
movies.info()

In [None]:
# Number of reviews per label, to check how balanced the classes are.
movies['label'].value_counts()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words='english')

# Bag-of-words counts over the reviews labelled 'pos', with English stop
# words excluded by the vectorizer.
pos_reviews = movies.loc[movies['label'] == 'pos', 'review']
matrix = count_vect.fit_transform(pos_reviews)

# Pair each vocabulary term with its total count across those reviews.
vocab = count_vect.get_feature_names_out()
totals = matrix.sum(axis=0).tolist()[0]
freqs = zip(vocab, totals)

# Show the 20 most frequent terms, highest count first.
top_words = sorted(freqs, key=lambda pair: pair[1], reverse=True)[:20]
print(top_words)

In [None]:
# Features are the raw review text; the target is the label column.
X, y = movies['review'], movies['label']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [None]:
# TF-IDF features feeding a linear SVM, bundled in one pipeline so the
# vectorizer is only ever fit on the data the pipeline itself is fit on.
# random_state pins the solver's internal random number generator so a
# fresh Restart & Run All reproduces the same model; 101 matches the seed
# used for the train/test split.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svc', LinearSVC(random_state=101)),
])

In [None]:
# Fit both pipeline steps (vectorizer, then classifier) on the training split only.
pipe.fit(X_train,y_train)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
def report(model):
    """Print evaluation metrics for ``model`` on the held-out test split.

    Predicts on the notebook-level ``X_test`` and prints, in order, the
    classification report, the confusion matrix and the accuracy score
    computed against ``y_test``. Each is followed by a blank line.

    Parameters
    ----------
    model : object
        Any fitted estimator or pipeline exposing ``predict``.
    """
    predictions = model.predict(X_test)
    for metric in (classification_report, confusion_matrix, accuracy_score):
        print(metric(y_test, predictions), "\n")

In [None]:
# Predict a label for every review in the held-out test split.
preds = pipe.predict(X_test)

In [None]:
# Display the predicted labels for a quick visual check.
preds

In [None]:
# Accuracy of the predictions against the true test labels; the extra
# '\n' argument only adds a blank line after the number.
print(accuracy_score(y_test,preds),'\n')

In [None]:
# Per-class precision, recall, F1 and support on the test split.
print(classification_report(y_test,preds))

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
print(confusion_matrix(y_test,preds))