# Preprocessing Speech (Chirac/Mitterrand)

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from utils import *

## Load text

`alllabs` holds the speaker label of each text:
- 1: Chirac
- -1: Mitterrand

In [2]:
# Location of the training corpus, relative to this notebook
FILE_NAME = "../../dataset/corpus.tache1.learn.utf8"

# Read the texts and their labels (1 = Chirac, -1 = Mitterrand)
alltxts, alllabs = load_pres(FILE_NAME)

# Corpus size, then one sample text followed by its label
print(len(alltxts), alltxts[10], alllabs[10], sep="\n")

57413
 A Brazzaville, que l'Afrique de demain se dessine.

1


## Remove Numbers


In [3]:
# Clean the raw texts before vectorising; the result feeds every later step.
# NOTE(review): presumably strips digits/numeric tokens (helper comes from utils) -- confirm.
processed_txts = remove_numbers(alltxts)

In [4]:
# Build a list of extra stop words from the cleaned texts and their labels.
# Presumably keeps words whose per-class frequency ratio is >= 0.95, judging by
# the report the helper prints -- verify against utils.
# NOTE(review): this uses the labels of the *whole* corpus, before the train/test
# split performed later in the notebook, so held-out labels influence the feature
# set. Consider computing the list on the training portion only.
uninformative_words = find_uninformative_words(processed_txts, alllabs, threshold=0.95)
len(uninformative_words)

Top uninformative words (appear equally in both classes):
secteurs            : ratio=1.000, class1=0.0025, class2=0.0025
avec                : ratio=1.000, class1=0.0856, class2=0.0856
annee               : ratio=0.999, class1=0.0128, class2=0.0128
periode             : ratio=0.999, class1=0.0029, class2=0.0029
equipements         : ratio=0.999, class1=0.0015, class2=0.0015
contact             : ratio=0.999, class1=0.0011, class2=0.0011
exercer             : ratio=0.998, class1=0.0019, class2=0.0019
chances             : ratio=0.998, class1=0.0037, class2=0.0037
plein               : ratio=0.998, class1=0.0017, class2=0.0017
troisieme           : ratio=0.997, class1=0.0027, class2=0.0027
britannique         : ratio=0.995, class1=0.0007, class2=0.0007
siege               : ratio=0.995, class1=0.0007, class2=0.0007
complementarite     : ratio=0.995, class1=0.0007, class2=0.0007
calendrier          : ratio=0.995, class1=0.0007, class2=0.0007
ancien              : ratio=0.995, class1=0.00

231

## Vectorize with TfidfVectorizer

**TF-IDF**: words can also be weighted by importance.  
Corpus: $C = \{\mathbf d_{1}, \ldots, \mathbf d_{|C|}\}$, vocabulary: $V = \{\mathbf w_{1}, \ldots, \mathbf w_{|V|}\}$:

- $d_{ik}^{(tf)}$: term frequency of word $w_k$ in document $d_i$, s.t. $\sum_{k=1}^{|V|} d_{ik}^{(tf)} = 1$  
- $\mathrm{df}_{k}$ document frequency: $\mathrm{df}_{k} = \frac{|\{\mathbf d : w_{k} \in \mathbf d\}|}{|C|}$

TF-IDF for word $w_k$ in document $d_i$:

$$
d_{ik}^{(tfidf)} = d_{ik}^{(tf)} \, \log \frac{1}{\mathrm{df}_{k}}
$$

**Main parameters:**
- **use_idf:** boolean, default=True. Enable inverse-document-frequency reweighting; if False, only the term frequencies are used.  
- **smooth_idf:** Smooth idf weights, default=True. Adds one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents division by zero.  
- **sublinear_tf:** boolean, default=False. Apply sublinear tf scaling, i.e., replace $d_{ik}^{(tf)}$ with $1 + \log(d_{ik}^{(tf)})$.

In [5]:
# Get French stopwords as a list
# NLTK's French stop words, extended with the corpus-specific uninformative words
french_stopwords = stopwords.words('french') + uninformative_words
# Remove their accents so they can match the tokens produced with strip_accents='unicode'
french_stopwords_no_accents = [remove_accents(word) for word in french_stopwords]
# Optional (disabled): also stem the stop words when the documents are stemmed
#french_stopwords_stemmed = stemming_french(french_stopwords_no_accents)

# ATTENTION: one sentence = one doc
vectorizer = TfidfVectorizer(
    lowercase=True,           # lowercase the text before tokenizing
    stop_words=french_stopwords_no_accents,     # custom stop-word list built above
    max_df=0.80,             # ignore terms that appear in more than 80% of docs
    min_df=15,                # ignore terms that appear in fewer than 15 docs
    ngram_range=(1, 2),       # unigrams + bigrams
    strip_accents='unicode'  # strip accents during preprocessing
)

## Stemming (Optional)
Might hurt performance, test if needed

In [6]:
# Disabled by default.
# NOTE(review): as written this reads `alltxts`, so enabling it would overwrite the
# output of remove_numbers() stored in `processed_txts` -- confirm the intended input.
#processed_txts = stemming_french(alltxts)

In [7]:
# Learn the vocabulary and IDF weights, and build the document-term matrix in one step.
# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the
# train/test split, so test documents contribute to the vocabulary and IDF weights.
X = vectorizer.fit_transform(processed_txts)

**n_features is the vocabulary size**: the number of distinct unigrams and bigrams kept after stop-word removal and `min_df`/`max_df` filtering

In [8]:
# X is a sparse matrix
# Terms kept by the vectorizer, one per column of X
feature_names = vectorizer.get_feature_names_out()

# X is a sparse matrix of shape (n_documents, n_features)
print(f"Shape of X: {X.shape}")
print(feature_names[:10])

Shape of X: (57413, 6990)
['abandon' 'abandonner' 'abord' 'abord avant' 'abord dire'
 'abord monsieur' 'abord parce' 'abord remercier' 'abord saluer' 'aborde']


## Train / test split

In [9]:
from sklearn.model_selection import train_test_split

# Fixed seed so the shuffled 80/20 split is reproducible
rs = 10
X_train, X_test, y_train, y_test = train_test_split(
    X,
    alllabs,
    test_size=0.2,
    random_state=rs,
    shuffle=True,
)

# Sanity check: sizes of both feature matrices and of the training labels
for split_size in (X_train.shape, X_test.shape, len(y_train)):
    print(split_size)


(45930, 6990)
(11483, 6990)
45930


## Try on three models
- Naïve Bayes
- Logistic Regression
- SVM

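For now, just fit each model below with mostly default parameters: apart from fixed random seeds, only Logistic Regression overrides the defaults (`solver='liblinear'`, `C=100`, `tol=1e-8`).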

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score


# Naïve Bayes with scikit-learn's default settings
nb_clf = MultinomialNB()
nb_clf.fit(X_train, y_train)

# Logistic Regression -- not the defaults: liblinear solver, a very tight
# stopping tolerance, and C=100 (C is the inverse regularisation strength,
# so this is a weakly regularised model)
t = 1e-8
C = 100.0
lr_clf = LogisticRegression(random_state=0, solver='liblinear', max_iter=100, tol=t, C=C)
lr_clf.fit(X_train, y_train)

# Linear SVM with default settings (seed fixed for reproducibility)
svm_clf = LinearSVC(random_state=0)
svm_clf.fit(X_train, y_train)

# Predictions on the training split (to gauge over-fitting)
pred_nbt = nb_clf.predict(X_train)
pred_lrt = lr_clf.predict(X_train)
pred_svmt = svm_clf.predict(X_train)

# Predictions on the held-out split
pred_nb = nb_clf.predict(X_test)
pred_lr = lr_clf.predict(X_test)
pred_svm = svm_clf.predict(X_test)

# One report line per model, all in the same format
# (fixes the garbled "Naïve" label and the "accurac ytrain" typo on the SVM line)
for model_label, train_pred, test_pred in (
    ("Naïve Bayes", pred_nbt, pred_nb),
    ("Logistic Regression", pred_lrt, pred_lr),
    ("SVM", pred_svmt, pred_svm),
):
    print(
        f"{model_label} accuracy train={accuracy_score(y_train, train_pred)}, "
        f"accuracy test={accuracy_score(y_test, test_pred)}"
    )

Na誰ve Bayes accuracy train=0.8860004354452428, accuracy test=0.8807802838979361
Logistic Regression accuracy train=0.9408447637709558, accuracy test=0.8810415396673343
SVM accurac ytrain=0.930089266274766, accuracy test=0.8982844204476182
