In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import re
import pandas as pd
import numpy as np

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize

In [81]:
train_path = 'data/train.csv'
test_path = 'data/test.csv'

In [82]:
train_df = pd.read_csv(train_path, index_col=0)
train_df['target'] = train_df.genres.apply(lambda x: re.findall("(?<=')[\w,-]+(?=')", x))

In [98]:
stop_words = set(stopwords.words('english'))
stemmer = WordNetLemmatizer()

In [1]:
def clean_text(text):
    text = re.sub("<BR>", "", text)
    # dot between F.B.I.
    text = re.sub("\.", "", text)
    # Remove all the special characters
    text = re.sub(r'\W', ' ', text)
    # remove all single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    # Converting to Lowercase
    text = text.lower()

    tokens = word_tokenize(text)
    tokens = [stemmer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [100]:
train_df['dialogue_prep'] = train_df['dialogue'].apply(lambda x: clean_text(x))

In [101]:
X = train_df['dialogue_prep']

multi_bin = MultiLabelBinarizer()
y = multi_bin.fit_transform(train_df['target'])

In [102]:
tf_idf_vect = TfidfVectorizer(max_features=20000)

In [116]:
def convert_y(y_pred, n_max=6):
    y_new = np.zeros(y_pred.shape)
    for ind in range(y_pred.shape[0]):
        bound = sum(sorted(y_pred[ind])[::-1][:n_max]) / n_max
        y_new[ind] = (y_pred[ind] > bound).astype(int)
    return y_new

def scorer(est, X_test, y_test):
    y_test_proba = est.predict_proba(X_test)
    y_pred = convert_y(y_test_proba)
    return f1_score(y_pred, y_test, average='samples')

In [117]:
X_all = tf_idf_vect.fit_transform(X)

In [36]:
params_list = [{'C': 1.5}, {'C': 2.5}, {'C': 3.5}]

for params in params_list:
    lr = LogisticRegression(**params,)
    clf = OneVsRestClassifier(lr)
    score = cross_val_score(clf, X_all, y, scoring=scorer, cv=5)
    print(f"For {params}, score mean= {score.mean():.3f}, std={score.std():.3f}")

For {'C': 1.5}, score mean= 0.654, std=0.003
For {'C': 2.5}, score mean= 0.656, std=0.003
For {'C': 3.5}, score mean= 0.656, std=0.004


In [108]:
lr = LogisticRegression(C=3.)
clf = OneVsRestClassifier(lr)
clf.fit(X_all, y)

OneVsRestClassifier(estimator=LogisticRegression(C=3.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

# scoring

In [109]:
test_df = pd.read_csv(test_path, index_col=0)
test_df['dialogue_prep'] = test_df['dialogue'].apply(lambda x: clean_text(x))
X_test_tfidf = tf_idf_vect.transform(test_df['dialogue_prep'])

In [110]:
y_test_proba = clf.predict_proba(X_test_tfidf)
y_test = convert_y(y_test_proba)
test_df['target'] = multi_bin.inverse_transform(y_test)
test_df['genres'] = test_df['target'].apply(lambda x: ' '.join(x))
test_df[['genres']].to_csv('submit.csv')

## check zero predict

In [111]:
(y_test.sum(axis=1) == 0).sum()

0

In [112]:
!head submit.csv

id,genres
0,crime drama
1,drama thriller
2,drama
3,drama romance
4,action thriller
5,drama romance thriller
6,comedy romance
7,comedy drama romance
8,drama romance


In [23]:
!head submit.csv

id,genres
0,crime drama
1,drama thriller
2,drama
3,drama romance
4,action thriller
5,drama romance thriller
6,comedy drama romance
7,comedy drama romance
8,drama romance
