## Importing Required Libraries

In [43]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
import gensim
from sklearn.svm import LinearSVC
import random
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import sys
from sklearn.naive_bayes import MultinomialNB

## Loading the Dataset

In [44]:
imdb = "E:/Momo/Datasets/imdbreviews/IMDB Dataset.csv"
imdb2 = "E:/Momo/Datasets/movie.csv/movie.csv"

df = pd.read_csv(imdb2).head(30000)
print(df.shape)
df.head()

(30000, 2)


Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


## Data Preprocessing

In [45]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [46]:
def clean_text(text):
    text=str(text).lower()
    text=re.sub('\[.*?\]', '', text)
    text=re.sub('https?://\S+|www\.\S+', '', text)
    text=re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = gensim.parsing.remove_stopwords(text)
    return text

df['text']=df['text'].apply(clean_text)
df.head()

Unnamed: 0,text,label
0,grew (b. ) watching loving thunderbirds. mates...,0
1,"movie dvd player, sat coke chips, expectations...",0
2,people know particular time past like feel nee...,0
3,"great biblical movies, bored death minute movi...",0
4,im die hard dads army fan change that. got tap...,1


## Splitting the dataset into training and testing data

In [47]:
# Split features and labels
X = df["text"]
y = df["label"]

# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

print(X_train.shape, X_test.shape)

(21000,) (9000,)


## Vectorizing the reviews

In [48]:
# Create feature vectors
# vectorizer = CountVectorizer()
# train_vectors = vectorizer.fit_transform(X_train)
# test_vectors = vectorizer.transform(X_test)
# len(vectorizer.vocabulary_)

In [49]:
# # Create feature vectors
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

## Using the model's accuracy as fitness score

In [50]:
# def fitness(params):
#     C, intercept_scaling = params
#     model = LinearSVC(C=C, intercept_scaling=intercept_scaling)
#     model.fit(train_vectors, y_train)
#     y_pred = model.predict(test_vectors)
#     return accuracy_score(y_test, y_pred)

def fitness1(params):
    C, intercept_scaling = params
    model = LinearSVC(C=C, intercept_scaling=intercept_scaling)
    model.fit(train_vectors, y_train)
    y_pred = model.predict(test_vectors)
    return accuracy_score(y_test, y_pred)

def fitness2(params):
    C, intercept_scaling = params
    model = LogisticRegression(C=C, intercept_scaling=intercept_scaling)
    model.fit(train_vectors, y_train)
    y_pred = model.predict(test_vectors)
    return accuracy_score(y_test, y_pred)

## Parameter tuning using Genetic Algorithm

In [51]:
# tournament selection
def tournament_selection(population, tournament_size):
    tournament = random.sample(population, tournament_size)
    tournament.sort(reverse=True)
    return tournament[0], tournament[1]


In [52]:
# uniform crossover
def uniform_crossover(parent1, parent2):
    child1 = [random.choice([parent1[i], parent2[i]]) for i in range(len(parent1))]
    child2 = [random.choice([parent1[i], parent2[i]]) for i in range(len(parent1))]
    return child1, child2


In [53]:
# Gaussian mutation
def gaussian_mutation(individual, mutation_rate):
    mutated_individual = []
    for gene in individual:
        if random.random() < mutation_rate:
            mutated_gene = gene + random.gauss(0,1)
            mutated_individual.append(max(0, mutated_gene))
        else:
            mutated_individual.append(gene)
    return tuple(mutated_individual)


In [56]:
# Define the genetic algorithm using the selection, crossover, and mutation functions
def genetic_algorithm():
    # Set the parameters
    population_size = 10
    mutation_rate = 0.3
    num_generations = 20
    tournament_size = 5

    # Create an initial population of random solutions
    population = []
    for i in range(population_size):
        C = random.uniform(0.001, 10)
        intercept_scaling = random.uniform(0.001, 10)
        population.append((C, intercept_scaling))

    # Run the genetic algorithm
    for generation in range(num_generations):
        # Evaluate the fitness of each solution
        fitness_scores = [(fitness1(params), params) for params in population]
        # fitness_scores = [(fitness2(params), params) for params in population]
        fitness_scores.sort(reverse=True)

        # Print the best solution in this generation
        print("Generation {}: Best solution = {}".format(generation, fitness_scores[0]))

        # Select the parents for the next generation using tournament selection
        parents = []
        for i in range(population_size // 2):
            parent1, parent2 = tournament_selection(population, tournament_size)
            parents.append((parent1, parent2))

        # Create the next generation using crossover and mutation
        next_generation = []
        for parent_pair in parents:
            child1, child2 = uniform_crossover(parent_pair[0], parent_pair[1])
            child1 = gaussian_mutation(child1, mutation_rate)
            child2 = gaussian_mutation(child2, mutation_rate)
            next_generation.append(child1)
            next_generation.append(child2)

        # Replace the old population with the new generation
        population = next_generation

    # Print the final solution
    fitness_scores = [(fitness1(params), params) for params in population]
    fitness_scores.sort(reverse=True)
    print("Final solution = {}".format(fitness_scores[0][1]))

# Run the genetic algorithm to optimize the hyperparameters of LinearSVC
genetic_algorithm()



Generation 0: Best solution = (0.886, (0.8570070563398765, 1.3818410456963521))




Generation 1: Best solution = (0.8724444444444445, (4.313038545420566, 3.736734464692697))




Generation 2: Best solution = (0.8696666666666667, (7.534908881751572, 3.3838789689333395))




Generation 3: Best solution = (0.8696666666666667, (7.574451437116461, 3.3838789689333395))




Generation 4: Best solution = (0.8692222222222222, (8.10777668678151, 5.747847976014469))




Generation 5: Best solution = (0.8686666666666667, (9.262549574473875, 6.572693455610086))




Generation 6: Best solution = (0.8695555555555555, (7.393042326227211, 6.526551670976585))




Generation 7: Best solution = (0.869, (7.646370775477484, 7.823302432526592))




Generation 8: Best solution = (0.8683333333333333, (10.720176374234288, 7.231425502976532))




Generation 9: Best solution = (0.8674444444444445, (12.850856600899665, 7.275435734339225))




Generation 10: Best solution = (0.8676666666666667, (12.180888703238212, 8.157903388688323))




Generation 11: Best solution = (0.8681111111111111, (12.997758342304293, 8.603852207349009))




Generation 12: Best solution = (0.8698888888888889, (14.403379673864912, 8.88458953028473))




Generation 13: Best solution = (0.8682222222222222, (15.469683137457189, 8.88458953028473))




Generation 14: Best solution = (0.8682222222222222, (14.526415525227266, 8.88458953028473))




Generation 15: Best solution = (0.8684444444444445, (15.676384773270762, 9.934906961262879))




Generation 16: Best solution = (0.8694444444444445, (16.16466448903002, 8.88458953028473))




Generation 17: Best solution = (0.8685555555555555, (16.183577674481423, 9.200809800663956))




Generation 18: Best solution = (0.869, (16.183577674481423, 9.200809800663956))




Generation 19: Best solution = (0.8687777777777778, (16.052119614934632, 10.549110362248012))




Final solution = (16.237379769483763, 9.419496203342344)




## Optimization using GridSearch

In [None]:
# params = {
#     'C': [0.1, 1, 10, 100, 200, 300, 400, 500, 600, 1000],
#     'intercept_scaling': [0.1, 1, 10, 100, 200, 300, 400, 500, 700, 1000]
# }

# # Use GridSearchCV to search for the best hyperparameters
# model = LogisticRegression()
# grid_search = GridSearchCV(model, params, cv=5)
# grid_search.fit(train_vectors, y_train)

# # Print the best hyperparameters and the corresponding score
# print("Best parameters: ", grid_search.best_params_)
# print("Best score: ", grid_search.best_score_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters:  {'C': 10, 'intercept_scaling': 0.1}
Best score:  0.8807857142857143
