# EPSI M2 ISD 
# Handy Pedro VALERY & Charles Dehlinger

In [1]:
# Importing libraries

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from IPython.core.display import display, HTML
from functools import reduce
from sklearn import metrics

#display(HTML("<style>.container { width:90% !important; }</style>"))

import re
import random
import statistics

## Data initialisation

In [2]:
# Setting global variables
# Word pattern: a word character, then any run of word characters or
# apostrophes, ending on a word character (so tokens have at least 2 characters).
# Written as a raw string so the backslashes reach the regex engine untouched;
# the compiled pattern is the same as before, without the invalid-escape warning.
rgx = re.compile(r"([\w][\w']*\w)")
# Read the training corpus, one row per line; the context manager closes the handle.
with open("train.txt") as corpus_file:
    text = corpus_file.readlines()
textLen = len(text)

## Custom functions

In [3]:
# Partitioning the text into k contiguous portions to be used for cross validation
# text : Corpus (list of rows)
# k    : Number of portions

def textToPartions(text,k):
    """Split ``text`` into at most ``k`` contiguous, non-overlapping portions.

    The length is taken from the ``text`` argument itself (not from a global),
    and the portions are balanced: when ``len(text)`` is not a multiple of
    ``k`` the first portions each get one extra row. When the length divides
    evenly, every portion has ``len(text) // k`` rows.

    Parameters:
        text: sequence of corpus rows; only its length is used.
        k: requested number of portions, must be >= 1.

    Returns:
        A list of inclusive ``(start, end)`` index pairs covering every index
        of ``text`` exactly once. Empty input gives an empty list; if ``text``
        has fewer than ``k`` rows, one single-row portion per row is returned.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    total = len(text)
    if total == 0:
        return []

    # Never create empty portions when there are fewer rows than portions.
    k = min(k, total)
    base, extra = divmod(total, k)

    portions = []
    start = 0
    for index in range(k):
        size = base + 1 if index < extra else base
        portions.append((start, start + size - 1))
        start += size

    return portions

In [4]:
# Rebuilds a sub-corpus from a list of portion index pairs
# textArray : Corpus (list of rows)
# portions  : List of inclusive (start, end) index pairs to retrieve from the corpus

def portionsToSubtext(textArray, portions):
    """Return the rows of ``textArray`` covered by ``portions``, in order.

    Each portion is an inclusive ``(start, end)`` pair; the rows of all
    portions are concatenated in the order the portions are given.
    """
    return [row
            for first, last in portions
            for row in textArray[first:last + 1]]

In [5]:
# Preparing Cross Validation Portions (Train, Test, Validation)

def createShuffledSubText(originalText):
    """Build the cross-validation layout for ``originalText``.

    The corpus is cut into portions with ``textToPartions(originalText, 5)``.
    The last portion is set aside as the validation portion; the remaining
    ones are rotated so that each fold trains on three consecutive portions
    and tests on the following one. Despite the function name, nothing is
    shuffled: portions are contiguous index ranges.

    Returns:
        A dict with
        * ``'Train-Test'``: one dict per fold holding ``'train'`` (three
          portions), ``'test'`` (one portion) and empty accumulators
          ``'language'``, ``'words'``, ``'score'`` and ``'failed'``;
        * ``'validation'``: the held-out ``(start, end)`` portion.

    Raises:
        ValueError: if fewer than four portions remain after removing the
            validation portion (train and test could not be kept disjoint).
    """
    partitions = textToPartions(originalText,5)
    validation = partitions.pop()

    # Rotate over however many portions are actually left instead of a
    # hard-coded 4, so the indices stay valid whatever textToPartions returns.
    folds = len(partitions)
    if folds < 4:
        raise ValueError("at least 5 partitions are required to build train/test folds")

    result = { 'Train-Test': [], 'validation': validation}

    for i in range(folds):

        curr = {'train' : [partitions[i],
                           partitions[(i+1)%folds],
                           partitions[(i+2)%folds]],
                'test'  : [partitions[(i+3)%folds]],
                'language':{},
                'words'  :{},
                'score':0, 'failed':[]}

        result['Train-Test'].append(curr)
    return result

In [6]:
def tokenize(input_text):
    """Lower-case ``input_text`` and split it into sentences and words.

    Sentences are the pieces obtained by splitting on ``". "``; words are the
    matches of the module-level ``rgx`` pattern inside each sentence.

    Returns a 3-tuple:
        * list of every word in order of appearance (with repetitions),
        * dict mapping each distinct word to ``(occurrence count, 'Pure')``,
        * number of sentences (pieces produced by the split).
    """
    word_list = []
    word_counts = {}

    sentences = input_text.lower().split(". ")

    for sentence in sentences:
        for word in rgx.findall(sentence):
            word_list.append(word)
            if word in word_counts:
                word_counts[word] = (word_counts[word][0] + 1, 'Pure')
            else:
                word_counts[word] = (1, 'Pure')

    return word_list, word_counts, len(sentences)

In [7]:
def wordCounter(row,language,TTP):
    """Tokenize one corpus row and merge its word counts into ``TTP['words']``.

    Parameters:
        row: text of one corpus row.
        language: language code the row belongs to.
        TTP: partition dict; its ``'words'`` entry is updated in place. For
            every word it stores ``{'occurences': total, 'language': {code: count}}``.

    Returns:
        ``(word list, word dict, sentence count, TTP)`` where the first three
        items come straight from ``tokenize(row)`` and ``TTP`` is the same
        object that was passed in.
    """
    word_list, word_dict, phrase_total = tokenize(row)
    global_words = TTP['words']

    for word in word_dict:
        count = word_dict[word][0]

        if word not in global_words:
            # First time this word is seen in the partition
            global_words[word] = {'occurences': count, 'language': {language: count}}
            continue

        entry = global_words[word]
        entry['occurences'] += count

        per_language = entry['language']
        if language in per_language:
            per_language[language] += count
        else:
            per_language[language] = count

    return word_list, word_dict, phrase_total, TTP

In [8]:
def addDict(d1, d2):
    """Merge the ``(count, tag)`` entries of ``d2`` into ``d1`` and return ``d1``.

    Words missing from ``d1`` are copied over as-is; for words present in
    both, the counts are added and the tag already stored in ``d1`` is kept.
    ``d1`` is modified in place.
    """
    for word, incoming in d2.items():
        if word in d1:
            current = d1[word]
            d1[word] = (current[0] + incoming[0], current[1])
        else:
            d1[word] = incoming
    return d1

In [9]:
def extract(a,b):
    """Tag as ``'Dirty'`` every word of ``a`` that is also a key of ``b``.

    ``a`` maps words to ``(count, tag)`` pairs and is modified in place;
    entries already tagged ``'Dirty'`` are left untouched, and counts are
    never changed. The same dict ``a`` is returned.
    """
    for word in a:
        entry = a[word]
        if entry[1] != 'Dirty' and word in b:
            a[word] = (entry[0], 'Dirty')
    return a

In [10]:
def inter(l1, l2):
    """Return a dict whose keys are the elements of ``l2`` also found in ``l1``.

    Every key maps to 0; keys follow the order of ``l2``.
    """
    return {item: 0 for item in l2 if item in l1}

## Data Preparation
### Global variables containing (Train, Test, Validation) for a total of 4 Portions of the corpus

In [11]:
# Build the cross-validation layout from the whole corpus.
Data = createShuffledSubText(text)
# List of folds; each fold dict carries its 'train'/'test' portions plus the
# 'language', 'words', 'score' and 'failed' entries filled in by later cells.
Train_Test = Data['Train-Test']
# Held-out portion as an inclusive (start, end) index pair.
# NOTE(review): Valid is not referenced again in the visible cells.
Valid = Data['validation']

## Gathering statistics for each portions

In [12]:
# Build, for every partition, the per-language and global word statistics
# from the rows of its training portions
for TTP in Train_Test:

    for row in portionsToSubtext(text,TTP['train']):

        # Characters 1-3 of a row hold the language code, the text starts at
        # index 5 and the last character of the row is dropped
        lang = row[1:4]
        words, counts, phrases, TTP = wordCounter(row[5:-1],lang,TTP)

        per_language = TTP['language']

        if lang not in per_language:
            # First row seen for this language in this partition
            per_language[lang] = {'Dict':counts,'Words':len(words),'Phrases':phrases}
            continue

        entry = per_language[lang]
        entry['Dict']     = addDict(entry['Dict'],counts)
        entry['Words']   += len(words)      # Running total of words for the language
        entry['Phrases'] += phrases         # Running total of sentences for the language

## Setting up averages on the previous variables

In [13]:
# Derive the distinct-word count and the three ratios for each language of each partition
for TTP in Train_Test:
    for lang_stats in TTP['language'].values():
        total_words   = lang_stats['Words']
        total_phrases = lang_stats['Phrases']
        distinct      = len(lang_stats['Dict'])

        lang_stats['Distinct'] = distinct

        # words per sentence, distinct words per word, distinct words per sentence
        lang_stats['avg_WP'] = round(total_words/total_phrases,2)
        lang_stats['avg_DW'] = round(distinct/total_words,2)
        lang_stats['avg_DP'] = round(distinct/total_phrases,2)

## Preview of some statistical data for the set of languages in each Portion

In [14]:
# Print the statistics gathered for each language of each partition
for TT in Train_Test:
    # Training portions the statistics below were computed from
    print("Partition", TT['train'],'\n')
    for l in TT['language']:

        # Always read from the partition being printed (TT). The last column
        # previously read from a variable left over from an earlier loop and
        # therefore showed the last partition's ratio for every partition.
        lang_stats = TT['language'][l]

        print(l,
              "Total Words:",   lang_stats['Words'],
              "Distinct Words:",lang_stats['Distinct'],
              "Total Phrases:", lang_stats['Phrases'],
              "AVG-WP:",lang_stats['avg_WP'],      # words per sentence
              "AVG-DP:",lang_stats['avg_DP'],      # distinct words per sentence
              "Ratio DW:",lang_stats['avg_DW'])    # distinct words per word
    print("\n")

Partition [(0, 1979), (1980, 3959), (3960, 5939)] 

GER Total Words: 168567 Distinct Words: 8258 Total Phrases: 8732 AVG-WP: 19.3 AVG WD: 0.95 Ratio DW: 0.05
TUR Total Words: 166726 Distinct Words: 8506 Total Phrases: 9488 AVG-WP: 17.57 AVG WD: 0.9 Ratio DW: 0.05
CHI Total Words: 166209 Distinct Words: 9072 Total Phrases: 9396 AVG-WP: 17.69 AVG WD: 0.97 Ratio DW: 0.05
TEL Total Words: 175638 Distinct Words: 9943 Total Phrases: 8957 AVG-WP: 19.61 AVG WD: 1.11 Ratio DW: 0.06
ARA Total Words: 151002 Distinct Words: 10253 Total Phrases: 6685 AVG-WP: 22.59 AVG WD: 1.53 Ratio DW: 0.07
SPA Total Words: 167930 Distinct Words: 9224 Total Phrases: 7208 AVG-WP: 23.3 AVG WD: 1.28 Ratio DW: 0.05
HIN Total Words: 183841 Distinct Words: 10634 Total Phrases: 9288 AVG-WP: 19.79 AVG WD: 1.14 Ratio DW: 0.06
JPN Total Words: 150055 Distinct Words: 8017 Total Phrases: 9646 AVG-WP: 15.56 AVG WD: 0.83 Ratio DW: 0.05
KOR Total Words: 161829 Distinct Words: 8680 Total Phrases: 10145 AVG-WP: 15.95 AVG WD: 0.86 

### Setting up dirty words for each language of each Portion

In [15]:
# For every language, tag as 'Dirty' the words it shares with any other language
for TT in Train_Test:

    lang_entries = TT['language']

    for lang_name, lang_entry in lang_entries.items():
        for other_name, other_entry in lang_entries.items():
            if other_name == lang_name:
                continue
            # extract() updates the 'Dict' in place and returns it, so 'D'
            # ends up referring to the same dictionary object as 'Dict'
            lang_entry['D'] = extract(lang_entry['Dict'],other_entry['Dict'])

## Integrity test between the global dictionary and the language dictionaries for each Portion

In [16]:
# Consistency check: every word recorded for a language in the global
# dictionary must also be in that language's own dictionary.
# Each inconsistent word is printed once, with the partition's training portions.
for TT in Train_Test:

    for word, info in TT['words'].items():
        missing = any(word not in TT['language'][code]['Dict']
                      for code in info['language'])
        if missing:
            print(TT['train'],word)

In [17]:
# Computes per-language scores for a tokenized text input.
#  TT     : Portion dict; reads TT['language'][code]['Dict'] and TT['words']
#  tokens : list of words, e.g. the first item returned by tokenize(input_text)
#  debug  : when true, prints the scores of each language as they are computed

def stats(TT,tokens,debug):
    """Score ``tokens`` against each of the eleven hard-coded languages.

    For every language and every token:
      * token tagged ``'Pure'`` in the language dictionary: ``'pure'`` grows
        by ``1 / (len(tokens) * len(language dictionary))``;
      * token tagged ``'Dirty'``: ``'dirty'`` grows by the token's count in
        this language divided by a tenth of its count elsewhere, normalised
        by the same factor;
      * token absent from the language dictionary but present in
        ``TT['words']``: ``'global'`` grows by
        ``1 / (len(tokens) * len(TT['words']))``.
    Each increment is rounded to 10 decimals. ``'unknown'`` is never updated
    and stays at 0.

    Returns a dict mapping each language code to its
    ``{'pure', 'global', 'dirty', 'unknown'}`` scores.
    """
    language_codes = ('FRE', 'ARA', 'CHI', 'TUR', 'TEL', 'GER',
                      'JPN', 'HIN', 'SPA', 'KOR', 'ITA')

    langStats = {code: {'pure':0, 'global':0, 'dirty':0, 'unknown':0}
                 for code in language_codes}

    all_words   = TT['words']
    token_total = len(tokens)

    for code, tally in langStats.items():

        lang_dict = TT['language'][code]['Dict']
        dict_norm = token_total*len(lang_dict)
        word_norm = token_total*len(all_words)

        for token in tokens:

            if token not in lang_dict:
                # Unknown to this language: only counts if another language knows it
                if token in all_words:
                    tally['global'] += round(1/word_norm, 10)
                continue

            tag = lang_dict[token][1]

            if tag == 'Pure':
                tally['pure'] += round(1/dict_norm,10)

            elif tag == 'Dirty':
                own  = lang_dict[token][0]
                rest = all_words[token]['occurences']-own
                tally['dirty'] += round((own/(rest/10))/dict_norm,10)

        if debug :
            print(code,tally)

    return langStats

In [18]:
# Prediction by cost minimisation
# t     : Text
# TTP   : Portion
# debug : boolean, forwarded to stats()
# returns the language code with the lowest cost for the given input text.

def box(t,TTP,debug):
    """Predict the language of ``t`` from the statistics stored in ``TTP``.

    The text is tokenized, scored with ``stats`` and the language minimising
    ``(1 / (pure + 1)) * (global + 1) / (dirty + 1)`` is returned. Ties go to
    the first language in the order used by ``stats``.
    """
    langStats = stats(TTP,tokenize(t)[0],debug)

    def cost(code):
        entry = langStats[code]
        return (1/(entry['pure']+1))*(entry['global']+1)/(entry['dirty']+1)

    return min(langStats, key=cost)

In [19]:
# Evaluates the dictionary model of a Portion on its test rows
# returns the accuracy as a fraction (0 to 1) and the indices of the misclassified rows

def score (TT):
    """Return ``(accuracy, failed)`` for the test portions of ``TT``.

    Each test row is classified with ``box``; the language code is read from
    characters 1-3 of the row and the text from index 5 up to, but excluding,
    the last character. ``accuracy`` is built by adding ``1/len(Test)`` for
    every correct prediction; ``failed`` lists the positions (within the test
    rows) of the wrong ones.
    """
    Test  = portionsToSubtext(text,TT['test'])
    total = len(Test)

    acc    = 0
    failed = []

    for position, row in enumerate(Test):
        if box(row[5:-1],TT,False) == row[1:4]:
            acc += 1/total
        else:
            failed.append(position)

    return acc,failed

In [20]:
# Score the dictionary model on the test rows of every Portion and keep
# both the accuracy and the list of misclassified row indices
for TT in Train_Test:
    accuracy, misclassified = score(TT)
    TT['score']  = accuracy
    TT['failed'] = misclassified

In [21]:
# Print the accuracy of each Portion as a percentage (Portions numbered from 1)
for portion_number, each in enumerate(Train_Test, start=1):
    print("Portion",portion_number, 'score',round(each['score']*100,2),'%')

Portion 1 score 51.41 %
Portion 2 score 50.35 %
Portion 3 score 50.76 %
Portion 4 score 48.94 %


## Using Random Forest

In [22]:
# Builds the feature vector of one corpus row for the random forest model
# i   : Not used
# T   : Corpus row (text starts at index 5, last character is dropped)
# TTP : current Portion

def createFeature(i,T,TTP):
    """Return the list of numeric features describing row ``T``.

    The vector starts with seven text-shape features: word count, distinct
    word count, sentence count, and the ratios distinct/sentences,
    words/sentences, distinct/words and words/distinct (each rounded to 2
    decimals). It is followed, for every language returned by ``stats``, by
    five values derived from the 'pure' (P), 'global' (G) and 'dirty' (D)
    scores, each offset by 0.1: ``P, G, G/D, D, (1/P)*(G/D)``.

    A row without any word yields 0.0 for the ratios that would otherwise
    divide by zero, instead of raising ``ZeroDivisionError``.
    """
    def ratio(numerator, denominator):
        # Guard against rows with no words or no distinct words
        return round(numerator/denominator,2) if denominator else 0.0

    tk = tokenize(T[5:-1])
    Stats = stats(TTP,tk[0],False)

    w = len(tk[0])   # total words
    d = len(tk[1])   # distinct words
    p = tk[2]        # sentences

    features = [w, d, p, ratio(d,p), ratio(w,p), ratio(d,w), ratio(w,d)]

    for s in Stats:

        # The 0.1 offset keeps the divisions below away from zero
        P = Stats[s]['pure']  +0.1
        D = Stats[s]['dirty'] +0.1
        G = Stats[s]['global']+0.1

        features.append(P)
        features.append(G)
        features.append(G/D)
        features.append(D)
        features.append((1/P)*(G/D))

    return features

## Random Forest Classification with Cross Validation

In [23]:
# Training and scoring a random forest on every Portion
#
# NOTE(review): the dictionaries used by createFeature() were built from the
# same training rows that are featurised here, so training features are
# computed on text the statistics have already seen, while test features are not.
# NOTE(review): this overwrites TT['score'] set by the dictionary-model
# evaluation; export() later picks its partition from this value.

# Fixed seed so that re-running the notebook reproduces the reported scores
RF_SEED = 42

for TT in Train_Test:

    Train = portionsToSubtext(text,TT['train'])
    Test  = portionsToSubtext(text,TT['test' ])

    # Labels are the language codes stored in characters 1-3 of each row
    X_train = [createFeature(i,row,TT) for i, row in enumerate(Train)]
    y_train = [row[1:4] for row in Train]

    X_test = [createFeature(i,row,TT) for i, row in enumerate(Test)]
    y_test = [row[1:4] for row in Test]

    clf=RandomForestClassifier(n_estimators=1000, random_state=RF_SEED)
    clf.fit(X_train,y_train)

    y_pred=clf.predict(X_test)
    TT['score']=metrics.accuracy_score(y_test, y_pred)

In [24]:
# Print the random forest accuracy of each Portion as a percentage (Portions numbered from 1)

for portion_number, each in enumerate(Train_Test, start=1):
    print("Random Forest Classification with Cross Validation Portion",portion_number, 'score',round(each['score']*100,2),'%')

Random Forest Classification with Cross Validation Portion 1 score 42.47 %
Random Forest Classification with Cross Validation Portion 2 score 40.4 %
Random Forest Classification with Cross Validation Portion 3 score 41.67 %
Random Forest Classification with Cross Validation Portion 4 score 41.11 %


## Predicting using the best model (Dirty, Pure, Global) & the highest scored Portion

In [25]:
def export(predictionFunction, tagName = None):
    """Write a predicted language tag for every line of ``test.txt``.

    The partition of ``Train_Test`` with the highest ``'score'`` is used as
    the model. For each input line, the first five characters and the
    trailing newline are dropped, the remaining text is passed to
    ``predictionFunction(text, partition, False)``, and
    ``(<prediction>)<text>`` is written to the predictions file.

    Parameters:
        predictionFunction: callable taking ``(text, partition, debug)`` and
            returning the predicted language code as a string (e.g. ``box``).
        tagName: currently unused; kept for interface compatibility.
    """
    BEST_TRAINED_PARTITION = max( Train_Test, key=lambda i:i['score'])

    # Open the input first (read-only) so a missing test file does not leave
    # an empty predictions file behind; both files are closed even on error.
    with open("test.txt","r") as source, \
         open("EPSI M2-ISD Valery & Charles predictions.txt", "wt") as target:

        for line in source:
            # Strip only the line terminator: slicing off the last character
            # would lose a real character on a final line without a newline.
            content = line.rstrip('\n')[5:]
            target.write('('+ (predictionFunction(content,BEST_TRAINED_PARTITION,False))+')'+content+'\n')

In [26]:
# Create the prediction file: every line of test.txt is tagged with the
# language predicted by box(), using the partition with the highest 'score'
export(box)