In [1]:
import nltk
import os
import re
import math
import pandas as pd
import numpy as np
import copy
import string
import random
import itertools
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [2]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tokenized_files={}
total_no_of_files=0
os.chdir("C://Users//Hp//Desktop//Projects//Text Mining//bbcsport")
directories = os.listdir()
for directory in directories:
    path = os.path.join(os.getcwd(),directory)
    os.chdir(path)
    files = os.listdir()
    temp = []
    for file in files:
        with open(file,'r') as curr_file:
            s = curr_file.read()
            s = re.sub(r'\d+','',s)
            s = s.translate(str.maketrans("","",string.punctuation))
            s = s.strip()
            s = s.lower()
            t1 = nltk.tokenize.word_tokenize(s)
            t2 = []
            for word in t1:
                lem_word= lemmatizer.lemmatize(word)
                if(lem_word not in stop_words):
                    t2.append(lem_word)
            temp.append(t2)
            total_no_of_files+=1
    tokenized_files[directory]=temp
    os.chdir('..')

In [3]:
#Creating a unique set of all words
wordset=[]
for key in tokenized_files.keys():
    for file in tokenized_files[key]:
        for word in file:
            wordset.append(word)
        
wordset = list(set(wordset))

In [4]:
#Calculating the frequency of occurence of a specific word among all documents
wordFreq={}
for word in wordset:
    count=0
    for key in tokenized_files.keys():
        for file in tokenized_files[key]:
            if word in file:
                count+=1
    wordFreq[word]=count

In [5]:
#Calculating the tf-idf values for all words in the wordset for all documents
X = []
Y = []
athletics=[1,0,0,0,0]
cricket  =[0,1,0,0,0]
football =[0,0,1,0,0]
rugby    =[0,0,0,1,0]
tennis   =[0,0,0,0,1]
for key in tokenized_files.keys():
    file_number=0
    for file in tokenized_files[key]:
        file_tfidf = []
        n = len(file)
        for word in wordset:
            a = file.count(word)
            tf = a/float(n)
            idf= math.log(total_no_of_files/float(wordFreq[word]))
            file_tfidf.append(tf*idf)
        X.append(file_tfidf)
        if(key=='athletics'):
            Y.append(athletics)
        elif(key=='cricket'):
            Y.append(cricket)
        elif(key=='football'):
            Y.append(football)
        elif(key=='rugby'):
            Y.append(rugby)
        else:
            Y.append(tennis)

In [6]:
print(len(Y))

737


In [7]:
def population_initialization(K,P):
    '''
    Parameters:
        K : Number of clusters
        P : Size of Initial Population of chromosomes
    Output:
        P chromosomes of length K*N where N is the number of features
    Function:
        Takes K random datapoints from the dataset and appends them to create a chromosome
        This is repeated P times
    '''
    population=[]
    for i in range(P):
        chromosome=[]
        for j in range(K):
            chromosome.append(X[random.randint(0,len(X))])
        chromosome=list(itertools.chain(*chromosome))
        population.append(chromosome)
    return population

In [8]:
#Function to decide cluster using Euclidean distance
def cosine_similarity(centers,K,datapoint):
    '''
    Parameters:
        centers: List of Cluster centers
        K : number of clusters
        datapoint : Datapoint
    Output:
        Cluster number alloted based on maximum cosine_similarity of the datapoint with the cluster centers
    '''
    #Cosine Similarity    
    t = np.asarray(datapoint)
    normt= np.linalg.norm(t)
    dot=[]
    for i in range(K):
        temp_center = np.asarray(centers[i])
        normc = np.linalg.norm(temp_center)
        cos_sim_value=np.dot(t,temp_center)/(normt*normc)
        dot.append(cos_sim_value)
    
    return dot.index(max(dot))

In [9]:
def fitness_function(K,chromosome,X):
    '''
    Parameters:
        K: Number of clusters
        chromosome: Contains the centers of K clusters
        X : dataset
    Output:
        A fitness value f for the input chromosome
    Function:
        1. Initializes K centers using the chromosome
        2. Divides the dataset into K clusters using functions like cosine_similarity or euclidean_distance
        3. Recomputer cluster centers by averaging the datapoints of that cluster
        4. Computer fitness value as follows:
                M = sum from i in range(1,K)(Mi)
                Mi= sum of distances of datapoints of a cluster from its center
                fitness_value(f) = 1/M
    '''
    centers=[]
    #Dividing the chromosomes into centers
    for i in range(K):
        centers.append(chromosome[i*len(X[0]):len(X[0]*(i+1))])
    
    #Dividing the dataset into clusters
    clusters={}
    for i in range(K):
        clusters[i]=[]
    for i in range(len(X)):
        clusters[cosine_similarity(centers,K,X[i])].append(X[i])
    
    #Recomputing cluster centers
    centers=[]
    for i in range(K):
        temp=clusters[i]
        centers.append(np.sum(np.asarray(temp),axis=0)/len(temp))
    
    #Computing fitness value
    M=0
    for i in range(K):
        temp=clusters[i]
        center=centers[i]
        for j in temp:
            M+=np.linalg.norm(np.asarray(j)-np.asarray(center))
    
    fitness_score = 1/M
    return fitness_score        

In [10]:
def fitness_evaluation(K,population,X):
    '''
    Parameters:
        K: Number of clusters
        population: Contains all the chromosomes of the current generation
        X : dataset
    Output:
        A list of fitness scores of each chromosome in the parent
    '''
    fitness=[]
    for i in population:
        fitness.append(fitness_function(K,i,X))
    return fitness

In [11]:
def roulette_wheel(fitness_scores):
    sum_fitness_scores = np.sum(np.asarray(fitness_scores))
    temp_fitness = copy.deepcopy(fitness_scores)
    temp_fitness.sort(reverse=True)
    t=[]
    for i in range(len(fitness_scores)):
        temp = random.uniform(0,sum_fitness_scores)
        temp_sum=0
        for j in range(len(fitness_scores)):
            temp_sum=temp_sum+temp_fitness[j]
            if temp_sum>temp:
                t.append(fitness_scores.index(temp_fitness[j]))
                break
    t=list(set(t))
    return t[0:4]

In [12]:
def selection(population,fitness_scores):
    '''
    Parameters:
        population: Current populations i.e. set of parent chromosomes
        fitness_score: Fitness scores of the current population
    Ouput:
        population,fitness_score: Set of most fit chromosomes based on their fitness score
    Function:
        returns the list of all chromosome except one with the least fitness score
    '''
    worst_chromosome_index = fitness_scores.index(min(fitness_scores))
    del(fitness_scores[worst_chromosome_index])
    del(population[worst_chromosome_index])
    return population,fitness_scores

In [13]:
def cross_over(parent1,parent2):
    one_point = random.randint(1,len(parent1)-1)
    left_parent_1 = parent1[:one_point]
    right_parent_1= parent1[one_point:]
    left_parent_2 = parent2[:one_point]
    right_parent_2= parent2[one_point:]

    child1 = left_parent_1
    child2 = left_parent_2
    child1.extend(right_parent_2)
    child2.extend(right_parent_1)
    return child1,child2

In [27]:
def cross_over_function(population,fitness_scores,X,K):
    '''
    Parameters:
        population: Parent population chromosomes
        fitness_scores: fitness of parent population
    Output:
        children generation population
    Function:
        We select the 2 best chromosomes and 2 worst chromosomes and perform a cross-over between the bests and worsts and add
        them to the population generating children generation population with 4 different chromosomes.
    '''
    print("Cross-over:")
    c1i,c2i,c3i,c4i=roulette_wheel(fitness_scores)
    chromosome1 = population[c1i]
    chromosome2 = population[c2i]
    chromosome3 = population[c3i]
    chromosome4 = population[c4i]
    
    c1,c2 = cross_over(chromosome1,chromosome2)
    c3,c4 = cross_over(chromosome3,chromosome4)

    population.append(c1)
    population.append(c2)
    population.append(c3)
    population.append(c4)
    
    del(population[c1i])
    del(fitness_scores[c1i])
    del(population[c2i])
    del(fitness_scores[c2i])
    del(population[c3i])
    del(fitness_scores[c3i])
    del(population[c4i])
    del(fitness_scores[c4i])
    
    fitness = fitness_evaluation(K,[c1,c2,c3,c4],X)
    fitness_scores.extend(fitness)
    return population,fitness_scores

In [28]:
def mutation(children, mutation_rate=0.2):
    '''
    Parameters:
        children: Children we get from the genetic cross-over in the previous stage
        mutation_rate: Rate at which we want to mutate the children (0-1)
    Output:
        children with a mutated gene
    Functions:
        We randomly select a number between 0 and 1 using probably a gaussian distribution as mutation is not a frequent
        phenomenon. If that value is less than mutation_rate we mutate the children.
        For mutation we again generate a value alpha between 0 to 1. We then randomly select a position from the chromosome
        and changes its value as following
            let the value at the randomly selected position be v
            v = v (+/-) 2*alpha*v if(v!=0)
            v = v (+/-) 2*alpha   if(v==0)
        (+/-) is selected randomly giving each a 50-50 chance
    '''
    for i in range(len(children)):
        temp = np.random.rand(1)[0]
        if temp<mutation_rate:
            chromosome = children[i]
            position = random.randint(0,len(chromosome)-1)
            alpha = np.random.rand(1)[0]
            toss = random.randint(0,1)
#             print(chromosome[position])
            if toss==0:
                toss=-1
            if chromosome[position]==0.0:
                chromosome[position]=chromosome[position]+toss*2*alpha
            else:
                chromosome[position]=chromosome[position]+toss*2*alpha*chromosome[position]
            children[i]=chromosome
            
    return children

In [31]:
def main_func(X,x_test):
    '''
    Steps:
    Repeat 1-5 till 1 chromosome remains in the population
    1.Initialize Population
    2.Compute fitness scores
    3.Selection
    4.Cross-over
    5.Mutation
    '''
    K = int(input("Enter the number of clusters: "))
    P = int(input("Enter the size of the initial population: "))
    mutation_rate=float(input('''Mutation rate governs how many children you want to be mutated. Enter a value between 0 and 1.
                               Enter the mutation rate: '''))
    #print("Initializing population.....")
    population = population_initialization(K,P)
    #print("Population initialized!")
    gen=0
    while True:
        print("Generation: "+str(gen))
        gen+=1
        #print("Computing population fitness (using cosine_similarity)...")
        fitness=fitness_evaluation(K,population,X)
        children,fitness = cross_over_function(population,fitness,x_train,K)
        children = mutation(children,mutation_rate)
        population=children
        centers=[]
        chromosome = population[fitness.index(max(fitness))]
        for i in range(K):
            centers.append(chromosome[i*len(X[0]):len(X[0]*(i+1))])
        error=0
        for i in range(len(x_test)):
            temp = cosine_similarity(centers,K,x_test[i])
            if y_test[i][temp]==0:
                error+=1
        print("Error: "+str(error/len(x_test)))
    return population[0]

In [None]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.4)
chromosome = main_func(x_train,x_test)

Enter the number of clusters: 5
Enter the size of the initial population: 30
Mutation rate governs how many children you want to be mutated. Enter a value between 0 and 1.
                               Enter the mutation rate: .5
Generation: 0




Cross-over:
30
Error: 0.6779661016949152
Generation: 1
Cross-over:
30
Error: 0.6779661016949152
Generation: 2
Cross-over:
30
Error: 0.688135593220339
Generation: 3
Cross-over:
30
Error: 0.7152542372881356
Generation: 4
