In [1]:
import re
import time

import pandas as pd
import numpy as np
import nltk
import warnings
import pickle
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning
                        )

In [2]:
import data_cleaner

In [3]:
df_train = pd.read_csv('data/train.csv')
df_t = df_train.copy()

In [4]:
df_train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
df_train['clean_tweet'] = np.vectorize(data_cleaner.clean_mentions)(df_train['tweet'])
df_train['clean_tweet'] = df_train['clean_tweet'].str.replace("[^a-zA-Z#]", " ")
df_train['clean_tweet'] = df_train['clean_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))


df_train.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when father dysfunctional and selfish drags hi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit can use cause they don...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model love take with all the time
4,5,0,factsguide: society now #motivation,factsguide society now #motivation


In [6]:
from data_transformation import TfidfDataTransformer
from sklearn.model_selection import train_test_split

tr= TfidfDataTransformer()

df_train['clean_tweet'] = tr.stemming(df_train['clean_tweet'])
display(df_train.head())
df_train['clean_tweet'].to_csv('../models/model_data.csv', sep=',')
tr.vectorizer_fit(df_train['clean_tweet'])
df_tfidf = tr.transform(df_train['clean_tweet'])
pickle.dump(tr.vectorizer, open('../models/vectorizer.sav', 'wb'))
x_train, x_valid, y_train, y_valid = train_test_split(df_tfidf, df_train['label'],test_size=0.3,random_state=42)

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when father dysfunct and selfish drag hi kid i...
1,2,0,@user @user thanks for #lyft credit i can't us...,thank for #lyft credit can use caus they don o...
2,3,0,bihday your majesty,bihday your majesti
3,4,0,#model i love u take with u all the time in ...,#model love take with all the time
4,5,0,factsguide: society now #motivation,factsguid societi now #motiv


# Models and balancing

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score




model = LogisticRegression(random_state=0,solver='lbfgs')
model.fit(x_train, y_train)

In [8]:
predictions = model.predict_proba(x_valid)
predictions

array([[0.97822924, 0.02177076],
       [0.97365865, 0.02634135],
       [0.84763288, 0.15236712],
       ...,
       [0.95255636, 0.04744364],
       [0.76914456, 0.23085544],
       [0.94616521, 0.05383479]])

In [9]:
prediction_int = predictions[:,1]>=0.3
prediction_int = prediction_int.astype(np.int)
log_tfidf = f1_score(y_valid, prediction_int)

log_tfidf

AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [None]:
pickle.dump(model, open('../models/model.sav', 'wb'))
 

In [13]:
test_data = tr.transform(['This thing is pretty shit', 'I like this pretty much, it is super', 'ugly bad ugly'])
model.predict_proba(test_data)


array([[0.94886958, 0.05113042],
       [0.93658547, 0.06341453],
       [0.9530083 , 0.0469917 ]])

In [41]:
from imblearn.under_sampling import TomekLinks,RandomUnderSampler, CondensedNearestNeighbour,EditedNearestNeighbours
#balancing

tl = RandomUnderSampler(sampling_strategy='not minority',random_state=1337) # default - will remove the sample from the majority class
#tl = TomekLinks(sampling_strategy='not minority',n_jobs = -1)
x_train_res, y_train_res = tl.fit_resample(x_train, y_train)
model2 = LogisticRegression(random_state=1337,solver='lbfgs')

model2.fit(x_train_res, y_train_res)


'LogisticRegression(random_state=1337)'

In [None]:
pickle.dump(model, open('../models/model2.sav', 'wb'))

In [37]:
predictions = model2.predict_proba(x_valid)
prediction_int = predictions[:,1]>=0.3
prediction_int = prediction_int.astype(np.int32)
log_tfidf = f1_score(y_valid, prediction_int)


In [38]:
test_data = tr.transform(["it's unbelievable that in the 21st century we'd need something like this. again. #neverump  #xenophobia ", 'This thing is pretty shit', 'I like this pretty much, it is super', 'ugly bad ugly'])
model2.predict_proba(test_data)

array([[0.38324213, 0.61675787],
       [0.6438945 , 0.3561055 ],
       [0.57561075, 0.42438925],
       [0.63779563, 0.36220437]])

In [None]:
def analyze_data(model, data_path, data_transformer, data_balancer=None, save_model=True, save_vectorizer=True, verbose=True):
    data_name = data_path.split('/')[-1]
    name = f"{data_name}-:{model.__repr__()}"
    if verbose:
        print(f'Analyzing {name}')
    # Load data
    df_train = pd.read_csv('data/train.csv')
    # Clean data
    df_train['clean_tweet'] = np.vectorize(data_cleaner.clean_mentions)(df_train['tweet'])
    df_train['clean_tweet'] = df_train['clean_tweet'].str.replace("[^a-zA-Z#]", " ")
    df_train['clean_tweet'] = df_train['clean_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
    # Transform data
    df_train['clean_tweet'] = data_transformer.stemming(df_train['clean_tweet'])
    data_transformer.vectorizer_fit(df_train['clean_tweet'])
    df_tfidf = data_transformer.transform(df_train['clean_tweet'])
    if save_vectorizer:
        pickle.dump(tr.vectorizer, open(f'../models/{data_name}-{tr.vectorizer.__repr__()}.sav', 'wb'))
    x_train, x_valid, y_train, y_valid = train_test_split(df_tfidf, df_train['label'],test_size=0.3,random_state=1337)
    # Balance data
    if data_balancer is not None:
        x_train, y_train = tl.fit_resample(x_train, y_train)
    # Fit model (HYPERPARAMS???)
    model.fit(x_train, y_train)
    if save_model:
        pickle.dump(model, open(f'../models/{name}', 'wb'))
    # Predict
    predictions = model.predict_proba(x_valid)
    # Evaluate
    return name