# Challenge T2 - Group 19

Content:
1. Text stemming
2. Clustering and interpretation
3. Classification and prediction

In [1]:
"""
It takes one or two hours to finish the clustering with all the data,
if you want to test the code,
just take like the first 3000 or so ones.
"""

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import FrenchStemmer
import string
import stop_words
from sklearn.cluster import KMeans
import pandas as pd

"""
To use the two packages of nltk, do 
nltk.download("stopwords")
and
nltk.download('punkt')
in the python console
after you've installed nltk
if you don't want certain word that appears but not filtered by me,
add it in the list ['', '’', '``', '\'\'', '»', '...','«', 'nan', '--']
in the 3rd line of the get_stop_words() function
"""

def get_stop_words():
    custom_stop_words = set(stopwords.words('french') +
                            list(string.punctuation) +
                            ['', '’', '``', '\'\'', '»', '...','«', 'nan', '--'] +
                            stop_words.get_stop_words('fr'))
    return custom_stop_words


def tokenize(text):
    stemmer = FrenchStemmer()
    words_temp = word_tokenize(text, language='french')
    words_no_prefix = [f[2:] if f.startswith(("l\'","d\'","j\'","n\'","c\'")) else f for f in words_temp]
    words_no_prefix = [f[3:] if f.startswith(("qu\'")) else f for f in words_no_prefix]
    words_prefect = [stemmer.stem(word) for word in words_no_prefix if not word.isdigit()]
    return words_prefect

## 1. Stemming

In [2]:
file_names = ["DEMOCRATIE_ET_CITOYENNETE.csv", "LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.csv",
            "LA_TRANSITION_ECOLOGIQUE.csv", "ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.csv"]
file_name = file_names[0] # name of the file

df = pd.read_csv("data/" + file_name, low_memory=False)
n_questions= df.shape[1] - 11 # 11 features are basic information
n_answers = df.shape[0]

#Use this of you only want to do one certain question

'''
answers = df["QUXVlc3Rpb246MTA3 - En qui faites-vous le plus confiance pour vous faire représenter dans la société et pourquoi ?"]
answers = answers.str.lower()
answers = answers.values.tolist()
answers = [x for x in answers if type(x) is str]
'''

#Use this if you want to combine the responses of all the questions
answers = df.iloc[:,11:]
answers = answers.astype(str)
answers = answers.apply(" ".join, axis =1)
answers = answers.str.lower()
answers = answers.values.tolist()

In [3]:
#Vectorize our text
custom_stop_words = get_stop_words()
vectorizer = TfidfVectorizer(stop_words=custom_stop_words,
                            tokenizer=tokenize,
                            max_features=100)

In [4]:
'''
The X in the next line is the matrix transformed from all the text data,
if you wants to split the data to train/text or just divide,
split the X
'''

X = vectorizer.fit_transform(answers)
words = vectorizer.get_feature_names()   

  'stop_words.' % sorted(inconsistent))


## 2. Clustering

In [5]:
#train the cluster
kmeans = KMeans(n_clusters=4, n_init=20)
kmeans.fit(X)

#Display the clustering results
common_words = kmeans.cluster_centers_.argsort()[:, -1:-26:-1]
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(words[word] for word in centroid))

0 : dan, franc, plus, droit, loi, an, san, citoyen, comm, cel, person, faut, fair, national, ser, vot, autr, vi, grand, mêm, non, devr, enfant, débat, polit
1 : dan, plus, oui, franc, faut, fair, non, respect, pay, person, citoyen, autr, vot, cel, comm, polit, notr, bien, aid, mêm, gen, san, élus, immigr, vi
2 : citoyen, dan, oui, plus, vot, élus, non, polit, représent, local, associ, fair, faut, élect, national, assembl, respect, déput, particip, mandat, comm, part, commun, pouvoir, autr
3 : oui, non, vot, plus, chos, dan, respect, obligatoir, élus, citoyen, sais, proportionnel, déput, mair, fair, supprim, franc, blanc, référendum, immigr, person, travail, polit, comm, associ


In [6]:
y = kmeans.labels_ 

In [15]:
# labels from clustering
# transform labels to onehot code

from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
y_1hot = cat_encoder.fit_transform(y.reshape(-1,1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


## 3. Classification

In [21]:
# split into training set and test set
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
y_train, y_test = train_test_split(y_1hot, test_size=0.2, random_state=42)

In [31]:
'''
STEP 2 with Decision tree
'''
from sklearn import tree

clf_DT = tree.DecisionTreeClassifier()
clf_DT = clf_DT.fit(X_train, y_train.toarray())

In [32]:
clf_DT.score(X_test, y_test.toarray())

0.7352643305651205

In [None]:

'''
STEP 2 with Naive Bayes
'''
from sklearn.naive_bayes import MultinomialNB

clf_NB = MultinomialNB()
clf_NB.fit(X, y)

'''
STEP 2 with KNN
'''
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors = 8)
neigh.fit(X, y)

'''
STEP 2 train report
'''
from sklearn import metrics

y_predicted = neigh.predict(X)
print(metrics.classification_report(y, y_predicted))

#TODO
#Use the STEP1 cluster and the STEP2 cluster to predict the same
#block of data, use the result of the cluster as the correct one
#track the performance. Or you can change the parameters as you wish

#TODO IMPORTANT
#observe the clustering result and try to find a meaningful representation
#of each cluster like "people with a negative view", "people who don't believe
# democracy", etc. Like this our report will be more meaningful.