In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importación librerías
import pandas as pd
import os
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from nltk.stem import WordNetLemmatizer, PorterStemmer
wordnet_lemmatizer = WordNetLemmatizer()
import nltk
import string

In [3]:
# Carga de datos de archivo .csv
dataTraining = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', encoding='UTF-8', index_col=0)
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', encoding='UTF-8', index_col=0)

In [4]:
# Visualización datos de entrenamiento
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,most is the story of a single father who takes...,"['Short', 'Drama']",8.0
900,2008,How to Be a Serial Killer,a serial killer decides to teach the secrets o...,"['Comedy', 'Crime', 'Horror']",5.6
6724,1941,A Woman's Face,"in sweden , a female blackmailer with a disfi...","['Drama', 'Film-Noir', 'Thriller']",7.2
4704,1954,Executive Suite,"in a friday afternoon in new york , the presi...",['Drama'],7.4
2582,1990,Narrow Margin,"in los angeles , the editor of a publishing h...","['Action', 'Crime', 'Thriller']",6.6


In [5]:
dataTesting.head()

Unnamed: 0,year,title,plot
1,1999,Message in a Bottle,"who meets by fate , shall be sealed by fate ...."
4,1978,Midnight Express,"the true story of billy hayes , an american c..."
5,1996,Primal Fear,martin vail left the chicago da ' s office to ...
6,1950,Crisis,husband and wife americans dr . eugene and mr...
7,1959,The Tingler,the coroner and scientist dr . warren chapin ...


In [6]:
def clean_text(text):
    text = re.sub((r'[^\w\s]'),'', text).lower() 
    text = re.sub((r'\d+'),'', text).lower()
    text = re.sub((r'_+'),'', text).lower()
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english')
    words = re.sub(r'[^\w\s]', ' ', text).split()
    return ' '.join([wnl.lemmatize(word) for word in words if word not in stopwords])

In [7]:
dataTraining['clean_plot'] = dataTraining['plot'].apply(clean_text)

In [8]:
# Definición de variable de interés (y)
dataTraining['genres'] = dataTraining['genres'].map(lambda x: eval(x))
le = MultiLabelBinarizer()
y_genres = le.fit_transform(dataTraining['genres'])

In [9]:
#transform the genre_new column to a series of columns with binary values
binary_labels=pd.DataFrame(le.fit_transform(dataTraining['genres']),columns=le.classes_) 

#order columns alphabetically
binary_labels=binary_labels.sort_index(axis=1) 

binary_labels.tail()

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
7890,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7891,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7892,0,1,0,0,1,0,0,0,0,1,...,1,0,0,1,0,0,0,0,0,0
7893,0,1,1,0,0,0,0,1,1,1,...,0,0,0,0,1,0,0,0,0,0
7894,0,1,1,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Definición de variables predictoras (X)
vect = CountVectorizer()
X_dtm = vect.fit_transform(dataTraining['clean_plot'])
X_dtm.shape

(7895, 34451)

In [11]:
# Separación de variables predictoras (X) y variable de interés (y) en set de entrenamiento y test usandola función train_test_split
X_train, X_test, y_train_genres, y_test_genres = train_test_split(X_dtm, y_genres, test_size=0.33, random_state=42)

In [12]:
X_train.shape[0]

5289

In [13]:
X_test.shape[0]

2606

In [14]:
base_classifier = XGBClassifier(learning_rate=0.1,n_estimators=300,max_depth=3)

classifier = OneVsRestClassifier(base_classifier)

classifier.fit(X_train, y_train_genres)

OneVsRestClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, gamma=None,
                                            gpu_id=None, grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.1, max_bin=None,
                                            max_cat_threshold=None,
                       

In [15]:
# Predicción del modelo de clasificación
y_pred_genres = classifier.predict_proba(X_test)

# Impresión del desempeño del modelo
roc_auc_score(y_test_genres, y_pred_genres, average='macro')

0.8467548461655173

# Regresión Lineal

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [None]:
clf.fit(X_train, y_train_genres)

In [None]:
y_pred_genres_lr = clf.predict_proba(X_test)

In [None]:
roc_auc_score(y_test_genres, y_pred_genres_lr, average='macro')

# tfidf vectorizer

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

tfidf_vectorizer = TfidfVectorizer() #(max_df=0.8, max_features=10000)

In [20]:
X_train, X_test, y_train_genres, y_test_genres = train_test_split(dataTraining['clean_plot'], y_genres, test_size=0.20, random_state=9)

In [21]:
xtrain_tfidf = tfidf_vectorizer.fit_transform(X_train)
xtest_tfidf = tfidf_vectorizer.transform(X_test)

In [24]:
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

# fit model on train data
clf.fit(xtrain_tfidf, y_train_genres)

# make predictions for validation set
y_pred_genres_lr = clf.predict_proba(xtest_tfidf)

In [25]:
roc_auc_score(y_test_genres, y_pred_genres_lr, average='macro')

0.8821186614888777

In [29]:
X_test_dtm = tfidf_vectorizer.transform(dataTesting['plot'])

cols = ['p_Action', 'p_Adventure', 'p_Animation', 'p_Biography', 'p_Comedy', 'p_Crime', 'p_Documentary', 'p_Drama', 'p_Family',
        'p_Fantasy', 'p_Film-Noir', 'p_History', 'p_Horror', 'p_Music', 'p_Musical', 'p_Mystery', 'p_News', 'p_Romance',
        'p_Sci-Fi', 'p_Short', 'p_Sport', 'p_Thriller', 'p_War', 'p_Western']

# Predicción del conjunto de test
y_pred_test_genres = clf.predict_proba(X_test_dtm)

In [30]:
# Guardar predicciones en formato exigido en la competencia de kaggle
res = pd.DataFrame(y_pred_test_genres, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_RF_v2.csv', index_label='ID')
res.head()

Unnamed: 0,p_Action,p_Adventure,p_Animation,p_Biography,p_Comedy,p_Crime,p_Documentary,p_Drama,p_Family,p_Fantasy,...,p_Musical,p_Mystery,p_News,p_Romance,p_Sci-Fi,p_Short,p_Sport,p_Thriller,p_War,p_Western
1,0.158513,0.133796,0.03514,0.040765,0.381704,0.118596,0.037962,0.48096,0.087187,0.105016,...,0.033917,0.107319,0.000937,0.34457,0.070367,0.011257,0.027839,0.195897,0.030019,0.029013
4,0.132533,0.072546,0.035444,0.099672,0.417112,0.170419,0.058378,0.612866,0.068051,0.05238,...,0.04057,0.048329,0.001017,0.126381,0.034387,0.0109,0.035874,0.202329,0.054394,0.025752
5,0.126903,0.058488,0.020414,0.053618,0.260302,0.491105,0.041516,0.602027,0.042482,0.052979,...,0.023514,0.209161,0.001042,0.145062,0.076628,0.009533,0.026792,0.438735,0.033703,0.02351
6,0.122325,0.081977,0.024388,0.047432,0.248691,0.089474,0.03896,0.547766,0.058907,0.057058,...,0.033568,0.085883,0.000938,0.194343,0.106602,0.00921,0.031657,0.235299,0.054245,0.023249
7,0.092911,0.082768,0.032736,0.03169,0.24178,0.110788,0.041222,0.207287,0.054298,0.110465,...,0.024758,0.146533,0.000934,0.132055,0.39514,0.009962,0.017701,0.417299,0.020443,0.021551


# Gaussian Naibe vayes

In [32]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB


# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
clf = BinaryRelevance(GaussianNB())
# train
clf.fit(xtrain_tfidf, y_train_genres)
# predict
y_pred_genres_NB = clf.predict_proba(xtest_tfidf)

In [33]:
roc_auc_score(y_test_genres, y_pred_genres_lr, average='macro')

0.8821186614888777