## Importing Required Libraries

In [47]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import gensim
from sklearn.svm import LinearSVC
import random

## Loading the Dataset

In [48]:
imdb = "E:/Momo/Datasets/imdbreviews/IMDB Dataset.csv"

df = pd.read_csv(imdb)
print(df.shape)
df.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Data Preprocessing

In [49]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [50]:
def clean_text(text):
    text=str(text).lower()
    text=re.sub('\[.*?\]', '', text)
    text=re.sub('https?://\S+|www\.\S+', '', text)
    text=re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = gensim.parsing.remove_stopwords(text)
    return text

df['review']=df['review'].apply(clean_text)
df.head()

Unnamed: 0,review,sentiment
0,reviewers mentioned watching oz episode you'll...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive


## Splitting the dataset into training and testing data

In [51]:
# Split features and labels
X = df["review"]
y = df["sentiment"]

# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

print(X_train.shape, X_test.shape)

(35000,) (15000,)


## Vectorizing the reviews

In [52]:
# Create feature vectors
vectorizer = CountVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

## Using the model's accuracy as fitness score

In [53]:
def fitness(tol, C, intercept_scaling):
    model = LinearSVC(tol=tol, C=C, intercept_scaling=intercept_scaling)
    model.fit(train_vectors, y_train)
    y_pred = model.predict(test_vectors)
    return accuracy_score(y_test, y_pred)

## Parameter tuning using Genetic Algorithm

In [54]:
def tournament(rank):
    random.shuffle(rank)
    loc_max = (0, (0, 0, 0))
    glob_max = []
    for i in range(len(rank)):
        if loc_max[0] < rank[i][0]:
            loc_max = rank[i]
        if (i+1)%2 == 0:
            glob_max.append(loc_max)
            loc_max = (0, (0, 0, 0))
    
    x = []
    y = []
    z = []
    most_fit = (0, (0, 0, 0))
    for i in glob_max:
        x.append(i[1][0])
        y.append(i[1][1])
        z.append(i[1][2])
        if most_fit[0] < i[0]:
            most_fit = i

    return x, y, z, most_fit


In [55]:
def reproduction(x, y, z):
    temp_pop = []
    for i in range(len(x)):
        temp_pop.append(x[i])
        temp_pop.append(y[i])
        temp_pop.append(z[i])

    random.shuffle(temp_pop)

    new_x = []
    new_y = []
    new_z = []
    for _ in range(100):
        new_x.append(random.choice(temp_pop) * random.uniform(0.99, 1.01))
        new_y.append(random.choice(temp_pop) * random.uniform(0.99, 1.01)) 
        new_z.append(random.choice(temp_pop) * random.uniform(0.99, 1.01))

    return new_x, new_y, new_z

In [56]:
def tournament2(rank):
    random.shuffle(rank)
    glob_max = []
    for i in range(len(rank)):
        loc_max = (0, (0, 0, 0))
        for _ in range(10):
            temp = random.choice(rank)
            if loc_max[0] < temp[0]:
                loc_max =  temp
        glob_max.append(loc_max)

    x = []
    y = []
    z = []
    most_fit = (0, (0, 0, 0))
    for i in glob_max:
        x.append(i[1][0])
        y.append(i[1][1])
        z.append(i[1][2])
        if most_fit[0] < i[0]:
            most_fit = i

    return x, y, z, most_fit

# single point crossover
def reproduction2(x, y, z):
    random.shuffle(z)

    for i in range(len(x)):
        x[i] = x[i] * random.uniform(0.9, 1.1)
        y[i] = y[i] * random.uniform(0.9, 1.1)
        z[i] = z[i] * random.uniform(0.9, 1.1)

    return x, y, z

In [57]:
x_pop = []
y_pop = []
z_pop = []

for i in range(10):
    x_pop.append(random.uniform(0, 100))
    y_pop.append(random.uniform(0, 100))
    z_pop.append(random.uniform(0, 100))

In [58]:
xf = 0
yf = 0
zf = 0
max_acc = (0, (0,0,0))
for it in range(100):
    rank = []
    for i in range(len(x_pop)):
        acc = fitness(x_pop[i], y_pop[i], z_pop[i])
        rank.append((acc, (x_pop[i], y_pop[i], z_pop[i])))
        # print(acc)

    # selection
    x, y, z, most_fit = tournament(rank)

    # crossover
    x_pop, y_pop, z_pop = reproduction(x, y, z)

    print(it+1, "->", most_fit[0])

    if most_fit[0] > 95:
        print(most_fit)
        xf = most_fit[1][0]
        yf = most_fit[1][1]
        zf = most_fit[1][2]
        break
    if most_fit[0] > max_acc[0]:
        max_acc = most_fit       
    # print(most_fit)

1 -> 0.8594666666666667
2 -> 0.8787333333333334




KeyboardInterrupt: 

In [None]:
fitness(1e-4, 1.0, 1.0)



0.8615333333333334

In [None]:
max_acc

(0.8791333333333333, (77.56648545154367, 53.7767139197902, 15.716055501853745))

In [None]:
# #implemention Genetic Algoritm

# max_acc = (0, (0, 0, 0))

# for it in range(100):
#     lkj = 0
#     rank = []
#     for individual in population:
#         acc = fitness(individual[0], individual[1], individual[2])
#         rank.append((acc, individual))
    
#     rank.sort()
#     rank.reverse()
#     rank = rank[:5]
    
#     x_pop = []
#     y_pop = []
#     z_pop = []
#     for i in rank:
#         x_pop.append(i[1][0])
#         y_pop.append(i[1][1])
#         z_pop.append(i[1][2])

#     new_population = []
#     for _ in range(10):
#         x = random.choice(x_pop) * random.uniform(0.9, 1.1)
#         y = random.choice(y_pop) * random.uniform(0.9, 1.1)
#         z = random.choice(z_pop) * random.uniform(0.9, 1.1)
#         new_population.append((x, y, z))

#     population = new_population
#     local_best_fit = rank[0][1]
    
#     # if fitness(local_best_fit[0], local_best_fit[1], local_best_fit[2]) >= 90:
#     #     print(local_best_fit)
#     #     temp = local_best_fit
#     #     break
#     if rank[0][0] > max_acc[0]:
#         max_acc = (rank[0][0], local_best_fit)
#     print(it, rank[0][0])

# print(max_acc)