# Explore here

In [17]:
import pandas as pd
import regex as re
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import *
from imblearn.metrics import specificity_score
import numpy as np
from sklearn.model_selection import GridSearchCV
import joblib

In [5]:
df = pd.read_csv("hf://datasets/sepidmnorozy/English_sentiment/train.csv")
df

Unnamed: 0,label,text
0,1,The Rock is destined to be the st Century 's n...
1,1,The gorgeously elaborate continuation of `` Th...
2,1,Singer\/composer Bryan Adams contributes a sle...
3,1,Yet the act is still charming here .
4,1,Whether or not you 're enlightened by any of D...
...,...,...
6915,0,A real snooze .
6916,0,No surprises .
6917,1,We 've seen the hippie-turned-yuppie plot befo...
6918,0,Her fans walked out muttering words like `` ho...


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6920 entries, 0 to 6919
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   6920 non-null   int64 
 1   text    6920 non-null   object
dtypes: int64(1), object(1)
memory usage: 108.3+ KB


In [9]:
df=df.drop_duplicates()
df

Unnamed: 0,label,text
0,1,The Rock is destined to be the st Century 's n...
1,1,The gorgeously elaborate continuation of `` Th...
2,1,Singer\/composer Bryan Adams contributes a sle...
3,1,Yet the act is still charming here .
4,1,Whether or not you 're enlightened by any of D...
...,...,...
6915,0,A real snooze .
6916,0,No surprises .
6917,1,We 've seen the hippie-turned-yuppie plot befo...
6918,0,Her fans walked out muttering words like `` ho...


In [12]:
df.isnull().sum()

label    0
text     0
dtype: int64

In [15]:
def preprocess_text(text):
    # Eliminar cualquier caracter que no sea una letra (a-z) o un espacio en blanco ( )
    text = re.sub(r'[^a-z ]', " ", text)
    
    # Eliminar espacios en blanco
    text = re.sub(r'\s+[a-zA-Z]\s+', " ", text)
    text = re.sub(r'\^[a-zA-Z]\s+', " ", text)

    # Reducir espacios en blanco múltiples a uno único
    text = re.sub(r'\s+', " ", text.lower())

    # Eliminar tags
    text = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ", text)

    return text.split()

In [16]:
df['text']=df['text'].apply(preprocess_text)
df

Unnamed: 0,label,text
0,1,"[he, ock, is, destined, to, be, the, st, entur..."
1,1,"[he, gorgeously, elaborate, continuation, of, ..."
2,1,"[inger, composer, ryan, dams, contributes, sle..."
3,1,"[et, the, act, is, still, charming, here]"
4,1,"[hether, or, not, you, re, enlightened, by, an..."
...,...,...
6915,0,"[real, snooze]"
6916,0,[surprises]
6917,1,"[ve, seen, the, hippie, turned, yuppie, plot, ..."
6918,0,"[er, fans, walked, out, muttering, words, like..."


In [18]:
download("wordnet")
lemmatizer = WordNetLemmatizer()

download("stopwords")
stop_words = stopwords.words("english")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\luisC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luisC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
def lemmatize_text(words, lemmatizer = lemmatizer):
    tokens = [lemmatizer.lemmatize(word) for word in words]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [word for word in tokens if len(word) > 3]
    return tokens

In [20]:
df['text']=df['text'].apply(lemmatize_text)
df

Unnamed: 0,label,text
0,1,"[destined, entury, onan, going, make, splash, ..."
1,1,"[gorgeously, elaborate, continuation, ings, tr..."
2,1,"[inger, composer, ryan, contributes, slew, son..."
3,1,"[still, charming]"
4,1,"[hether, enlightened, errida, lecture, self, e..."
...,...,...
6915,0,"[real, snooze]"
6916,0,[surprise]
6917,1,"[seen, hippie, turned, yuppie, plot, enthusias..."
6918,0,"[walked, muttering, word, like, horrible, terr..."


In [24]:
tokens_list = df["text"]
tokens_list = [" ".join(tokens) for tokens in tokens_list]

vectorizer = TfidfVectorizer(max_features = 5000, max_df = 0.8, min_df = 5)
X = vectorizer.fit_transform(tokens_list).toarray()
y = df["label"]
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.76219263,
        0.        ]])

In [42]:
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [26]:
model = SVC(kernel = "linear", random_state = 42)
model.fit(X_train, y_train)

In [27]:
y_pred_test=model.predict(X_test)
y_pred_train=model.predict(X_train)


In [30]:
def get_metrics(y_train, y_test, y_pred_train, y_pred_test):
    # Calcular métricas para el conjunto de entrenamiento
    train_accuracy = accuracy_score(y_train, y_pred_train)
    train_f1 = f1_score(y_train, y_pred_train)
    train_auc = roc_auc_score(y_train, y_pred_train)
    train_precision = precision_score(y_train, y_pred_train)
    train_recall = recall_score(y_train, y_pred_train)
    train_specificity = specificity_score(y_train, y_pred_train)

    # Calcular métricas para el conjunto de prueba
    test_accuracy = accuracy_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test)
    test_auc = roc_auc_score(y_test, y_pred_test)
    test_precision = precision_score(y_test, y_pred_test)
    test_recall = recall_score(y_test, y_pred_test)
    test_specificity = specificity_score(y_test, y_pred_test)

    # Calcular la diferencia entre métricas de entrenamiento y prueba
    diff_accuracy = train_accuracy - test_accuracy
    diff_f1 = train_f1 - test_f1
    diff_auc = train_auc - test_auc
    diff_precision = train_precision - test_precision
    diff_recall = train_recall - test_recall
    diff_specificity = train_specificity - test_specificity

    # Crear un DataFrame con los resultados
    metrics_df = pd.DataFrame([[train_accuracy, train_f1, train_auc, train_precision, train_recall, train_specificity],[test_accuracy, test_f1, test_auc, test_precision, test_recall, test_specificity],[diff_accuracy, diff_f1, diff_auc, diff_precision, diff_recall, diff_specificity]],
                              columns = ['Accuracy', 'F1', 'AUC', 'Precision', 'Recall', 'Specificity'],
                              index = ['Train','Test', 'Diferencia'])

    return metrics_df

In [31]:
get_metrics(y_train,y_test,y_pred_train,y_pred_test)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.874096,0.876639,0.874313,0.886062,0.867415,0.88121
Test,0.767173,0.780952,0.767871,0.802797,0.760265,0.775478
Diferencia,0.106923,0.095687,0.106441,0.083265,0.10715,0.105733


In [32]:


param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "kernel": ["linear", "rbf", "sigmoid"],
    "gamma": ["scale", "auto"],  
    "class_weight": [None, "balanced"]  
}


grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5,)
grid_search

In [33]:
grid_search.fit(X_train,y_train)
grid_search.best_params_

{'C': 1, 'class_weight': None, 'gamma': 'scale', 'kernel': 'rbf'}

In [34]:
grid_search.best_estimator_

In [35]:
best_model = SVC(C=1, class_weight=None, gamma='scale', kernel='rbf', random_state=42)


In [38]:
best_model.fit(X_train,y_train)

In [39]:
y_pred_best_test=best_model.predict(X_test)
y_pred_best_train=best_model.predict(X_train)


In [40]:
get_metrics(y_train,y_test,y_pred_best_train,y_pred_best_test)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.972504,0.973069,0.972807,0.983172,0.963171,0.982443
Test,0.761388,0.777027,0.761368,0.793103,0.761589,0.761146
Diferencia,0.211115,0.196042,0.211439,0.190069,0.201581,0.221297


Este modelo presenta mayor sobreajuste con respecto al anterior

In [41]:
joblib.dump(model, 'model.pkl')


['model.pkl']