## Importing Required Libraries

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
import gensim
from sklearn.svm import LinearSVC
import random
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import sys

## Loading the Dataset

In [2]:
imdb = "E:/Momo/Datasets/imdbreviews/IMDB Dataset.csv"
imdb2 = "E:/Momo/Datasets/movie.csv/movie.csv"

df = pd.read_csv(imdb2).head(20000)
print(df.shape)
df.head()

(20000, 2)


Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


## Data Preprocessing

In [3]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [4]:
def clean_text(text):
    text=str(text).lower()
    text=re.sub('\[.*?\]', '', text)
    text=re.sub('https?://\S+|www\.\S+', '', text)
    text=re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = gensim.parsing.remove_stopwords(text)
    return text

df['text']=df['text'].apply(clean_text)
df.head()

Unnamed: 0,text,label
0,grew (b. ) watching loving thunderbirds. mates...,0
1,"movie dvd player, sat coke chips, expectations...",0
2,people know particular time past like feel nee...,0
3,"great biblical movies, bored death minute movi...",0
4,im die hard dads army fan change that. got tap...,1


## Splitting the dataset into training and testing data

In [5]:
# Split features and labels
X = df["text"]
y = df["label"]

# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

print(X_train.shape, X_test.shape)

(14000,) (6000,)


## Vectorizing the reviews

In [6]:
# Create feature vectors
vectorizer = CountVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
len(vectorizer.vocabulary_)

59875

In [7]:
# # Create feature vectors
# vectorizer = TfidfVectorizer()
# train_vectors = vectorizer.fit_transform(X_train)
# test_vectors = vectorizer.transform(X_test)

## Using the model's accuracy as fitness score

In [8]:
def fitness(params):
    C, intercept_scaling = params
    model = LinearSVC(C=C, intercept_scaling=intercept_scaling)
    model.fit(train_vectors, y_train)
    y_pred = model.predict(test_vectors)
    return accuracy_score(y_test, y_pred)

## Parameter tuning using Genetic Algorithm

In [9]:
# tournament selection
def tournament_selection(population, tournament_size):
    tournament = random.sample(population, tournament_size)
    tournament.sort(reverse=True)
    return tournament[0], tournament[1]


In [10]:
# uniform crossover
def uniform_crossover(parent1, parent2):
    child1 = [random.choice([parent1[i], parent2[i]]) for i in range(len(parent1))]
    child2 = [random.choice([parent1[i], parent2[i]]) for i in range(len(parent1))]
    return child1, child2


In [11]:
# Gaussian mutation
def gaussian_mutation(individual, mutation_rate):
    mutated_individual = []
    for gene in individual:
        if random.random() < mutation_rate:
            mutated_gene = gene + random.gauss(0,1)
            mutated_individual.append(mutated_gene)
        else:
            mutated_individual.append(gene)
    return tuple(mutated_individual)


In [12]:
# Define the genetic algorithm using the selection, crossover, and mutation functions
def genetic_algorithm():
    # Set the parameters
    population_size = 10
    mutation_rate = 0.3
    num_generations = 50
    tournament_size = 5

    # Create an initial population of random solutions
    population = []
    for i in range(population_size):
        C = random.uniform(0 + sys.float_info.epsilon, 1000)
        intercept_scaling = random.uniform(0 + sys.float_info.epsilon, 1000)
        population.append((C, intercept_scaling))

    # Run the genetic algorithm
    for generation in range(num_generations):
        # Evaluate the fitness of each solution
        fitness_scores = [(fitness(params), params) for params in population]
        fitness_scores.sort(reverse=True)

        # Print the best solution in this generation
        print("Generation {}: Best solution = {}".format(generation, fitness_scores[0]))

        # Select the parents for the next generation using tournament selection
        parents = []
        for i in range(population_size // 2):
            parent1, parent2 = tournament_selection(population, tournament_size)
            parents.append((parent1, parent2))

        # Create the next generation using crossover and mutation
        next_generation = []
        for parent_pair in parents:
            child1, child2 = uniform_crossover(parent_pair[0], parent_pair[1])
            child1 = gaussian_mutation(child1, mutation_rate)
            child2 = gaussian_mutation(child2, mutation_rate)
            next_generation.append(child1)
            next_generation.append(child2)

        # Replace the old population with the new generation
        population = next_generation

    # Print the final solution
    fitness_scores = [(fitness(params), params) for params in population]
    fitness_scores.sort(reverse=True)
    print("Final solution = {}".format(fitness_scores[0][1]))

# Run the genetic algorithm to optimize the hyperparameters of LinearSVC
genetic_algorithm()



Generation 0: Best solution = (0.8741666666666666, (482.18299681059807, 445.35437454399465))




Generation 1: Best solution = (0.8741666666666666, (481.77508947811185, 445.62735030309767))




Generation 2: Best solution = (0.8713333333333333, (612.2269148590224, 349.7661814439254))




Generation 3: Best solution = (0.8721666666666666, (612.9106706904497, 349.1860439671439))




Generation 4: Best solution = (0.8708333333333333, (613.7276609970115, 348.8718090826312))




Generation 5: Best solution = (0.865, (612.9461545350213, 667.9691749057552))




Generation 6: Best solution = (0.8728333333333333, (614.4258489765884, 350.01561726459477))




Generation 7: Best solution = (0.8706666666666667, (615.516397241284, 348.85809812079714))




Generation 8: Best solution = (0.8656666666666667, (614.540726068728, 348.9139571765881))




Generation 9: Best solution = (0.8728333333333333, (615.1442139765056, 348.85809812079714))




Generation 10: Best solution = (0.8706666666666667, (617.5831356611956, 351.4745807902979))




Generation 11: Best solution = (0.8708333333333333, (618.0220598240294, 349.87431373387057))




Generation 12: Best solution = (0.8731666666666666, (617.5575782896777, 349.87431373387057))




Generation 13: Best solution = (0.8721666666666666, (618.9765972166507, 349.19018754495903))




Generation 14: Best solution = (0.8726666666666667, (619.7765469055471, 349.36402280801866))




Generation 15: Best solution = (0.8691666666666666, (619.2708487437752, 349.4343860286044))




Generation 16: Best solution = (0.871, (621.1438451164009, 348.23830652933634))




Generation 17: Best solution = (0.8706666666666667, (618.1752352055037, 348.17734278677034))




Generation 18: Best solution = (0.8738333333333334, (620.5026249590754, 348.822255512882))




Generation 19: Best solution = (0.8673333333333333, (621.5934754881073, 348.8246031032196))




Generation 20: Best solution = (0.871, (621.4759673945327, 349.34333896241645))




Generation 21: Best solution = (0.8718333333333333, (621.8909477693103, 351.71187402616215))




Generation 22: Best solution = (0.8728333333333333, (622.5985999991975, 349.1465956432698))




Generation 23: Best solution = (0.8725, (622.3779688475198, 350.05450482035104))




Generation 24: Best solution = (0.8725, (624.1360072909791, 354.2450523817431))




Generation 25: Best solution = (0.873, (624.5915979742953, 352.05679217399023))




Generation 26: Best solution = (0.873, (624.1360072909791, 353.2820029079391))




Generation 27: Best solution = (0.8726666666666667, (624.1360072909791, 354.2450523817431))




Generation 28: Best solution = (0.8723333333333333, (625.2857283174048, 354.0468017886932))




Generation 29: Best solution = (0.8728333333333333, (624.504011763928, 354.2450523817431))




Generation 30: Best solution = (0.8671666666666666, (625.8730050254468, 352.84297839865656))




Generation 31: Best solution = (0.8678333333333333, (625.7518085892734, 350.667212994355))




Generation 32: Best solution = (0.8665, (625.8730050254468, 353.28287503170463))




Generation 33: Best solution = (0.8608333333333333, (628.3749909187593, 354.09004584750477))




Generation 34: Best solution = (0.8726666666666667, (628.3749909187593, 352.89702569492255))




Generation 35: Best solution = (0.8723333333333333, (628.3749909187593, 355.185991584496))




Generation 36: Best solution = (0.8738333333333334, (628.3749909187593, 355.185991584496))




Generation 37: Best solution = (0.8735, (629.8625170657418, 355.185991584496))




Generation 38: Best solution = (0.8726666666666667, (631.0259532690108, 354.09004584750477))




Generation 39: Best solution = (0.8698333333333333, (631.0259532690108, 353.9687175133256))




Generation 40: Best solution = (0.872, (631.443386965558, 354.09004584750477))




Generation 41: Best solution = (0.8723333333333333, (631.443386965558, 354.09004584750477))




Generation 42: Best solution = (0.8736666666666667, (630.4988363798298, 354.4472519921109))




Generation 43: Best solution = (0.8663333333333333, (632.0440499926899, 355.43214832330887))




Generation 44: Best solution = (0.8703333333333333, (630.9654178342809, 353.6912987941901))




Generation 45: Best solution = (0.8631666666666666, (632.0440499926899, 353.17699828326016))




Generation 46: Best solution = (0.8711666666666666, (632.3338822990652, 356.23758293094886))




Generation 47: Best solution = (0.8703333333333333, (633.9398039489416, 355.43214832330887))




Generation 48: Best solution = (0.872, (633.9398039489416, 356.2377901225951))




Generation 49: Best solution = (0.872, (634.6047478521854, 356.41457721581276))




Final solution = (634.2590513563588, 353.8131690668817)




## Optimization using GridSearch

In [13]:
params = {
    'C': [0.1, 1, 10, 100, 200, 300, 400, 500, 600, 1000],
    'intercept_scaling': [0.1, 1, 10, 100, 200, 300, 400, 500, 700, 1000]
}

# Use GridSearchCV to search for the best hyperparameters
model = LinearSVC()
grid_search = GridSearchCV(model, params, cv=5)
grid_search.fit(train_vectors, y_train)

# Print the best hyperparameters and the corresponding score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)



Best parameters:  {'C': 200, 'intercept_scaling': 200}
Best score:  0.8698571428571429


