## Importing Required Libraries

In [14]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score
import gensim
from sklearn.svm import LinearSVC
import random
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import sys

## Loading the Dataset

In [15]:
imdb = "E:/Momo/Datasets/imdbreviews/IMDB Dataset.csv"
imdb2 = "E:/Momo/Datasets/movie.csv/movie.csv"

df = pd.read_csv(imdb2).head(20000)
print(df.shape)
df.head()

(20000, 2)


Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


## Data Preprocessing

In [16]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [17]:
def clean_text(text):
    text=str(text).lower()
    text=re.sub('\[.*?\]', '', text)
    text=re.sub('https?://\S+|www\.\S+', '', text)
    text=re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = gensim.parsing.remove_stopwords(text)
    return text

df['text']=df['text'].apply(clean_text)
df.head()

Unnamed: 0,text,label
0,grew (b. ) watching loving thunderbirds. mates...,0
1,"movie dvd player, sat coke chips, expectations...",0
2,people know particular time past like feel nee...,0
3,"great biblical movies, bored death minute movi...",0
4,im die hard dads army fan change that. got tap...,1


## Splitting the dataset into training and testing data

In [18]:
# Split features and labels
X = df["text"]
y = df["label"]

# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

print(X_train.shape, X_test.shape)

(14000,) (6000,)


## Vectorizing the reviews

In [19]:
# Create feature vectors
vectorizer = CountVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
len(vectorizer.vocabulary_)

59875

In [20]:
# # Create feature vectors
# vectorizer = TfidfVectorizer()
# train_vectors = vectorizer.fit_transform(X_train)
# test_vectors = vectorizer.transform(X_test)

## Using the model's accuracy as fitness score

In [21]:
def fitness(params):
    C = params[0]
    model = LinearSVC(C=C)
    model.fit(train_vectors, y_train)
    y_pred = model.predict(test_vectors)
    return 1/accuracy_score(y_test, y_pred)

## Parameter tuning using NSGA2

In [22]:
from pymoo.core.problem import Problem
import numpy as np

class ProblemWrapper(Problem):

    def _evaluate(self, params, out, *args, **kwargs):
        res = []
        for param in params:
            res.append(fitness(param))

        out['F'] = np.array(res)

In [23]:
problem = ProblemWrapper(n_var=1, n_obj=1, xl = [0.1], xu= [50.])

In [24]:
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.optimize import minimize

algorithm = NSGA2(pop_size=20)

stop_criteria = ('n_gen', 50)


In [25]:
results = minimize(
    problem=problem,
    algorithm=algorithm,
    termination=stop_criteria
)



In [None]:
print(results.F)

[[1.18086991]
 [1.18086991]
 [1.18086991]
 [1.18086991]
 [1.18086991]
 [1.18086991]
 [1.18086991]
 [1.18086991]
 [1.18086991]]
