In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [13]:
PROCESSED_DATASET_FOLDER = '../data/processed'
DATASET_CLEAN_LOCATION = f'{PROCESSED_DATASET_FOLDER}/Language Detection Clean.csv'
DATASET_CLEAN_UNDERSAMPLING_LOCATION = f'{PROCESSED_DATASET_FOLDER}/Language Detection Clean Undersampling.csv'

In [14]:
df = pd.read_csv(DATASET_CLEAN_LOCATION)
df_undersampling = pd.read_csv(DATASET_CLEAN_UNDERSAMPLING_LOCATION)

In [15]:
class Datasets:
    
    def __init__(self, X, y, X_t, y_t, text, text_t):
        self.X = X
        self.y = y
        self.X_t = X_t
        self.y_t = y_t
        self.text = text
        self.text_t = text_t
        
def split_dataset(df: pd.DataFrame, vectorizer: CountVectorizer) -> Datasets: 
    X = vectorizer.fit_transform(df['Text'])
    y = LabelEncoder().fit_transform(df['Language'])
    X, X_t, y, y_t, text, text_t = train_test_split(
        X, y, df['Text'].values ,test_size=0.2, stratify=y, random_state=1999
    )
    return Datasets(X, y, X_t, y_t, text, text_t)

df_bow = split_dataset(df, CountVectorizer())
df_bow_undersampling = split_dataset(df_undersampling, CountVectorizer())
df_tfidf = split_dataset(df, TfidfVectorizer())
df_tfidf_undersampling = split_dataset(df_undersampling, TfidfVectorizer())

In [None]:
def naive_bayes(datasets: Datasets):
    nb = MultinomialNB()
    nb.fit(datasets.X, datasets.y)
    y_pred = nb.predict(datasets.X_t)
    print("Naive Bayes:", accuracy_score(datasets.y_t, y_pred))
    
    wrong_idx = datasets.y_t != y_pred
    errors_df = pd.DataFrame({
        'Text': datasets.text_t[wrong_idx],         
        'True Label': datasets.y_t[wrong_idx],
        'Predicted Label': y_pred[wrong_idx]
    })
    print(errors_df)
    return nb
    
def logistic_regression(datasets: Datasets):
    param_grid = {
        'C': [0.01, 0.1, 1, 10],  # forza della regolarizzazione
        'penalty': ['l2'],
        'solver': ['lbfgs']
    }
    grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='accuracy')
    grid.fit(datasets.X, datasets.y)
    
    y_pred = grid.predict(datasets.X_t)
    print("Logistic Regression:", accuracy_score(datasets.y_t, y_pred))    
    
    wrong_idx = datasets.y_t != y_pred
    errors_df = pd.DataFrame({
        'Text': datasets.text_t[wrong_idx],         
        'True Label': datasets.y_t[wrong_idx],
        'Predicted Label': y_pred[wrong_idx]
    })
    print(errors_df)
    
    return grid
    
    

In [24]:
nb_bow = naive_bayes(df_bow)
nb_bow_und = naive_bayes(df_bow_undersampling)
nb_tfidf= naive_bayes(df_tfidf)
nb_tfidf_und = naive_bayes(df_tfidf_undersampling)

Naive Bayes: 0.9952038369304557
                    Text  True Label  Predicted Label
0         autorizzazione           1                0
1  interrupting politely           0                1
Naive Bayes: 0.9928057553956835
          Text  True Label  Predicted Label
0   scusandosi           1                0
1  progettando           1                0
Naive Bayes: 0.9952038369304557
              Text  True Label  Predicted Label
0   autorizzazione           1                0
1  lasciami finire           1                0
Naive Bayes: 0.9892086330935251
                    Text  True Label  Predicted Label
0             hesitating           0                1
1  i m afraid i disagree           0                1
2            suggestions           0                1


In [25]:
lr_bow = logistic_regression(df_bow)
lr_bow_und = logistic_regression(df_bow_undersampling)
lr_tfidf = logistic_regression(df_tfidf)
lr_tfidf_und = logistic_regression(df_tfidf_undersampling)

4
Logistic Regression: 0.988009592326139
                                                Text  True Label  \
0                                         permission           0   
1                                        suggestions           0   
2                              interrupting politely           0   
3  cells within colonies became increasingly spec...           0   
4                          in japanese copyright law           0   

   Predicted Label  
0                1  
1                1  
2                1  
3                1  
4                1  
4
Logistic Regression: 0.9784172661870504
                                                Text  True Label  \
0        image files varies across language editions           0   
1  cells within colonies became increasingly spec...           0   
2                                         hesitating           0   
3                              i m afraid i disagree           0   
4                                        s

In [None]:
# Decido di salvare solo le combinazioni dei due modelli con bow e tfidf