In [1]:
##Importing IMDB Dataset and cleaning reviews

#Importing libraries
import nltk
nltk.download('all')
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

#Importing dataset and replacing labels with 0 and 1 for classification
df = pd.read_csv('IMDB Dataset.csv', encoding = 'Latin-1')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

#Defining stop_words and lemmatizer
stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()

#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Defining clean_text function
def clean_text(text):
    text = strip_html(text)
    text = re.sub(r'[^A-Za-z0-9]+',' ',text)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

#Creating new column for processed reviews
df['Processed_Reviews'] = df.review.apply(lambda x: clean_text(x))

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\MianH\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\MianH\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\MianH\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\MianH\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\MianH\AppData\Roaming\nltk_data...
[

In [2]:
##Deploying SVM model on available data

#Importing libraries
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

#Defining input and target variable
x = df['Processed_Reviews']
y = df['sentiment']

#Training and splitting
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

#Vectorization and Bag of words method with default parameters
count_vect = CountVectorizer().fit(df['Processed_Reviews'].values.astype('U'))
bow_train = count_vect.transform(X_train.values.astype('U'))
bow_test = count_vect.transform(X_test.values.astype('U'))

#instantiate the model (using the default parameters)
SVM = SVC()

# fit the model with pre-processed data
SVM.fit(bow_train, y_train)

#perform classification and prediction on samples in tf_test
predicted_SVM = SVM.predict(bow_test)
print(classification_report(y_test, predicted_SVM))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87      5035
           1       0.86      0.89      0.87      4965

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [3]:
    

#Importing libraries
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

#Creating a Pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('SVM', SVC())
])

#Defining hyperparameters
parameters = {
    'vect__max_df':[0.1,0.2,0.3,0.4,0.5,0.6,0.7],
    'vect__ngram_range':  [(1,1), (1,2), (1,3)],
    'SVM__kernel': ['poly', 'rbf', 'sigmoid'],
    'SVM__C': [50, 10, 1.0, 0.1, 0.01]}

# define grid search
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(pipeline, param_grid=parameters, refit = True, verbose = 3, cv=5)
grid_result = grid_search.fit(df.loc[:5000, 'Processed_Reviews'].values.astype('U'), df.loc[:5000, 'sentiment'].values.astype('U'))

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 5 folds for each of 315 candidates, totalling 1575 fits
[CV 1/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.718 total time=  17.3s
[CV 2/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.700 total time=  16.0s
[CV 3/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.760 total time=  16.8s
[CV 4/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.770 total time=  15.8s
[CV 5/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.734 total time=  19.4s
[CV 1/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 2);, score=0.535 total time=  35.3s
[CV 2/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 2);, score=0.524 total time=  35.0s
[CV 3/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 2);, score=0.561 total time=

In [None]:
# Extract the best pipeline (which includes the best hyperparameters)
best_model = grid_result.best_estimator_

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split the full dataset into training and test sets
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    df['Processed_Reviews'].values.astype('U'),
    df['sentiment'].values.astype('U'),
    test_size=0.2,
    random_state=42
)

# Fit the best model on the training data
best_model.fit(X_train_full, y_train_full)

# Make predictions on the test data
predictions = best_model.predict(X_test_full)

# Print the classification report
print(classification_report(y_test_full, predictions))


              precision    recall  f1-score   support

           0       0.88      0.86      0.87      4961
           1       0.87      0.89      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

