## Importing Required Libraries

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
import gensim
from sklearn.svm import LinearSVC
import random
import matplotlib.pyplot as plt

## Loading the Dataset

In [2]:
imdb = "E:/Momo/Datasets/imdbreviews/IMDB Dataset.csv"
imdb2 = "E:/Momo/Datasets/movie.csv/movie.csv"

df = pd.read_csv(imdb2)
print(df.shape)
df.head()

(40000, 2)


Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


## Data Preprocessing

In [3]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [4]:
def clean_text(text):
    text=str(text).lower()
    text=re.sub('\[.*?\]', '', text)
    text=re.sub('https?://\S+|www\.\S+', '', text)
    text=re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = gensim.parsing.remove_stopwords(text)
    return text

df['text']=df['text'].apply(clean_text)
df.head()

Unnamed: 0,text,label
0,grew (b. ) watching loving thunderbirds. mates...,0
1,"movie dvd player, sat coke chips, expectations...",0
2,people know particular time past like feel nee...,0
3,"great biblical movies, bored death minute movi...",0
4,im die hard dads army fan change that. got tap...,1


## Splitting the dataset into training and testing data

In [5]:
# Split features and labels
X = df["text"]
y = df["label"]

# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

print(X_train.shape, X_test.shape)

(28000,) (12000,)


## Vectorizing the reviews

In [6]:
# Create feature vectors
vectorizer = CountVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

In [None]:
# Create feature vectors
vectorizer = CountVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

## Using the model's accuracy as fitness score

In [62]:
def fitness(params):
    C, max_iter = params
    max_iter = int(max_iter)
    model = LinearSVC(C=C, max_iter=max_iter)
    model.fit(train_vectors, y_train)
    y_pred = model.predict(test_vectors)
    return accuracy_score(y_test, y_pred)

## Parameter tuning using Genetic Algorithm

In [24]:
# tournament selection
def tournament_selection(population, tournament_size):
    tournament = random.sample(population, tournament_size)
    tournament.sort(reverse=True)
    return tournament[0], tournament[1]


In [21]:
# uniform crossover
def uniform_crossover(parent1, parent2):
    child1 = [random.choice([parent1[i], parent2[i]]) for i in range(len(parent1))]
    child2 = [random.choice([parent1[i], parent2[i]]) for i in range(len(parent1))]
    return child1, child2


In [22]:
# Gaussian mutation
def gaussian_mutation(individual, mutation_rate):
    mutated_individual = []
    for gene in individual:
        if random.random() < mutation_rate:
            mutated_gene = gene + random.gauss(0, 1)
            mutated_individual.append(mutated_gene)
        else:
            mutated_individual.append(gene)
    return tuple(mutated_individual)


In [63]:
# Define the genetic algorithm using the selection, crossover, and mutation functions
def genetic_algorithm():
    # Set the parameters
    population_size = 20
    mutation_rate = 0.1
    num_generations = 50
    tournament_size = 5

    # Create an initial population of random solutions
    population = []
    for i in range(population_size):
        C = random.uniform(0.1, 10)
        max_iter = random.randint(100, 1000)
        population.append((C, max_iter))

    # Run the genetic algorithm
    for generation in range(num_generations):
        # Evaluate the fitness of each solution
        fitness_scores = [(fitness(params), params) for params in population]
        fitness_scores.sort(reverse=True)

        # Print the best solution in this generation
        print("Generation {}: Best solution = {}".format(generation, fitness_scores[0]))

        # Select the parents for the next generation using tournament selection
        parents = []
        for i in range(population_size // 2):
            parent1, parent2 = tournament_selection(population, tournament_size)
            parents.append((parent1, parent2))

        # Create the next generation using crossover and mutation
        next_generation = []
        for parent_pair in parents:
            child1, child2 = uniform_crossover(parent_pair[0], parent_pair[1])
            child1 = gaussian_mutation(child1, mutation_rate)
            child2 = gaussian_mutation(child2, mutation_rate)
            next_generation.append(child1)
            next_generation.append(child2)

        # Replace the old population with the new generation
        population = next_generation

    # Print the final solution
    fitness_scores = [(fitness(params), params) for params in population]
    fitness_scores.sort(reverse=True)
    print("Final solution = {}".format(fitness_scores[0][1]))

# Run the genetic algorithm to optimize the hyperparameters of LinearSVC
genetic_algorithm()



Generation 0: Best solution = (0.86225, (0.2322265886603608, 711))




Generation 1: Best solution = (0.8544166666666667, (2.3263315746225377, 491))




Generation 2: Best solution = (0.8540833333333333, (5.897808609448853, 489.59578673115436))




Generation 3: Best solution = (0.8535833333333334, (7.9360472506098745, 114.52075536505649))




Generation 4: Best solution = (0.8538333333333333, (7.079065469504334, 114.52075536505649))


