# TFIDF_Linear_simple_baseline

## Imports

In [None]:
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict

import gc 
import re 
import scipy
from scipy import sparse

from pprint import pprint
from IPython.display import display
from matplotlib import pyplot as plt 

import time
import warnings
import scipy.optimize as optimize

warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.linear_model import Ridge, Lasso, BayesianRidge, LinearRegression, RidgeCV, ElasticNetCV, LassoCV, BayesianRidge, HuberRegressor, PassiveAggressiveRegressor

## Load data

In [None]:
df_train = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [None]:
# Per-label multipliers that turn the six binary toxicity labels into a single
# weighted severity score.
cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5, 
            'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

# Not good
# cat_mtpl = {'toxic': 1.0, 'severe_toxic': 2.5, 'obscene': 1.0,
#             'threat': 2.0, 'insult': 1.5, 'identity_hate': 2.0}

# cat_mtpl = {'toxic': 1.0, 'severe_toxic': 2.0, 'obscene': 1.0,
#             'threat': 1.0, 'insult': 1.0, 'identity_hate': 1.0}

# Scale every label column by its multiplier.
for category, multiplier in cat_mtpl.items():
    df_train[category] = df_train[category].mul(multiplier)

# Row-wise sum over the label-based column slice 'toxic'..'identity_hate';
# this relies on the label columns sitting next to each other in the frame.
df_train['score'] = df_train.loc[:, 'toxic':'identity_hate'].sum(axis=1)
df_train['y'] = df_train['score']

# Balanced preview frame: every row with y > 0 plus the same number of
# randomly drawn y == 0 rows.
is_toxic = df_train['y'] > 0
min_len = is_toxic.sum()
df_y0_undersample = df_train[df_train['y'] == 0].sample(n=min_len, random_state=201)
df_train_new = pd.concat([df_train[is_toxic], df_y0_undersample])
df_train_new.head(2)

In [None]:
df_train = df_train.rename(columns={'comment_text':'text'})

### Text cleaning

In [None]:
def text_cleaning(text):
    '''
    Reduce a raw comment to letters and digits separated by single spaces.

    Steps, in the order they are applied:
    1. Remove embedded URL links (http://, https:// or www. prefixes)
    2. Remove HTML tags (BeautifulSoup with the lxml parser, text kept)
    3. Remove emojis / pictographs
    4. Replace every character that is not a-z, A-Z or a digit with a space
    5. Collapse runs of spaces and trim both ends

    text - Text piece to be cleaned.
    '''

    # 1. Website links
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    without_urls = url_pattern.sub(r'', text)

    # 2. HTML markup
    cleaned = BeautifulSoup(without_urls, 'lxml').get_text()

    # 3. Emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    cleaned = emoji_pattern.sub(r'', cleaned)

    # 4. Special characters become spaces
    cleaned = re.sub(r"[^a-zA-Z\d]", " ", cleaned)

    # 5. Squeeze repeated spaces, then drop leading/trailing whitespace
    return re.sub(' +', ' ', cleaned).strip()

In [None]:
tqdm.pandas()
df_train['text'] = df_train['text'].progress_apply(text_cleaning)

In [None]:
df = df_train.copy()

## Undersampling

In [None]:
df['y'].value_counts(normalize=True)

In [None]:
# Keep every row scoring at least 0.1 and add twice as many randomly drawn
# zero-score rows.
toxic_mask = df['y'] >= 0.1
min_len = toxic_mask.sum()
df_y0_undersample = df[df['y'] == 0].sample(n=min_len * 2, random_state=402)
df = pd.concat([df[toxic_mask], df_y0_undersample])
df['y'].value_counts()

## TF-IDF

In [None]:
vec = TfidfVectorizer(min_df= 3, max_df=0.5, analyzer = 'char_wb', ngram_range = (3,5))
X = vec.fit_transform(df['text'])
X

## Fit models

In [None]:
%%time
model = Ridge(alpha=2.5)
model.fit(X, df['y'])

In [None]:
%%time
l_model = PassiveAggressiveRegressor(random_state=42, C=0.1, early_stopping=True)
l_model.fit(X, df['y'])

In [None]:
%%time
s_model = LinearSVR(C=1.0, random_state=42)
s_model.fit(X, df['y'])

In [None]:
model2 = Ridge(alpha=1.)
model2.fit(X, df['y'])

In [None]:
%%time
l_model2 = Ridge(alpha=2.)
l_model2.fit(X, df['y'])

In [None]:
%%time
s_model2 = Ridge(alpha=10.)
s_model2.fit(X, df['y'])

## Prepare validation data

In [None]:
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
df_val.head()

## Text cleaning

In [None]:
tqdm.pandas()
df_val['less_toxic'] = df_val['less_toxic'].progress_apply(text_cleaning)
df_val['more_toxic'] = df_val['more_toxic'].progress_apply(text_cleaning)

In [None]:
X_less_toxic = vec.transform(df_val['less_toxic'])
X_more_toxic = vec.transform(df_val['more_toxic'])

In [None]:
p1_m = model.predict(X_less_toxic)
p2_m = model.predict(X_more_toxic)

# Validation Accuracy
(p1_m < p2_m).mean()

In [None]:
p1_m2 = model2.predict(X_less_toxic)
p2_m2 = model2.predict(X_more_toxic)

# Validation Accuracy
(p1_m2 < p2_m2).mean()

In [None]:
p1_l = l_model.predict(X_less_toxic)
p2_l = l_model.predict(X_more_toxic)

# Validation Accuracy
(p1_l < p2_l).mean()

In [None]:
p1_l2 = l_model2.predict(X_less_toxic)
p2_l2 = l_model2.predict(X_more_toxic)

# Validation Accuracy
(p1_l2 < p2_l2).mean()

In [None]:
p1_s = s_model.predict(X_less_toxic)
p2_s = s_model.predict(X_more_toxic)

# Validation Accuracy
(p1_s < p2_s).mean()

In [None]:
p1_s2 = s_model2.predict(X_less_toxic)
p2_s2 = s_model2.predict(X_more_toxic)

# Validation Accuracy
(p1_s2 < p2_s2).mean()

In [None]:
# Blend of Ridge(alpha=2.5), PassiveAggressiveRegressor and LinearSVR
( (p1_m + p1_l + p1_s)/3 < (p2_m + p2_l + p2_s)/3 ).mean()

In [None]:
# Ridge models
( (p1_m2 + p1_l2 + p1_s2)/3 < (p2_m2 + p2_l2 + p2_s2)/3 ).mean()

In [None]:
# One column of validation predictions per model, for the "less toxic" side ...
lsub = pd.DataFrame({
    'Ridge2.5': p1_m,
    'PassiveAggressive': p1_l,
    'SVR': p1_s,
    'Ridge1': p1_m2,
    'Ridge2': p1_l2,
    'Ridge10': p1_s2,
})

# ... and for the "more toxic" side, with the same column order.
rsub = pd.DataFrame({
    'Ridge2.5': p2_m,
    'PassiveAggressive': p2_l,
    'SVR': p2_s,
    'Ridge1': p2_m2,
    'Ridge2': p2_l2,
    'Ridge10': p2_s2,
})

# Share of pairs where the plain average of all six models ranks the pair correctly
(lsub.mean(axis=1) < rsub.mean(axis=1)).mean()

In [None]:
# Average
( ( 0.5*(p1_m/3 + p1_l/3 + p1_s/3) + 0.5*(p1_m2/3 + p1_l2/3 + p1_s2/3) )   <   ( 0.5*(p2_m/3 + p2_l/3 + p2_s/3) + 0.5*(p2_m2/3 + p2_l2/3 + p2_s2/3) ) ).mean()

In [None]:
lr_sub = pd.concat([lsub, rsub])

# Rank each model's predictions over the pooled less-toxic + more-toxic rows,
# then split the rank table back into its two halves by position. The split
# point is taken from `lsub` rather than hard-coded, so it stays correct if the
# validation set changes size; the ranks are computed once and reused.
n_less = len(lsub)
lr_ranks = lr_sub.rank(method='average')

check_l = lr_ranks.iloc[:n_less]
check_r = lr_ranks.iloc[n_less:]

# Share of pairs where the mean rank of the "more toxic" comment is higher
(check_l.mean(axis=1) < check_r.mean(axis=1)).mean()

## Prepare submission

In [None]:
base_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

tqdm.pandas()
base_sub['text'] = base_sub['text'].progress_apply(text_cleaning)

In [None]:
X_test = vec.transform(base_sub['text'])

p3 = model.predict(X_test)
p4 = l_model.predict(X_test)
p5 = s_model.predict(X_test)

In [None]:
base_sub['Ridge2.5'] = p3
base_sub['PassiveAggressive'] = p4
base_sub['SVR'] = p5

In [None]:
# base_sub['score'] = (p3 + p4 + p5) / 3.

In [None]:
# base_sub2 = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# tqdm.pandas()
# base_sub2['text'] = base_sub2['text'].progress_apply(text_cleaning)

In [None]:
p3 = model2.predict(X_test)
p4 = l_model2.predict(X_test)
p5 = s_model2.predict(X_test)

base_sub['Ridge1'] = p3
base_sub['Ridge2'] = p4
base_sub['Ridge10'] = p5

# base_sub2['score'] = (p3 + p4 + p5) / 3.

In [None]:
base_scores = base_sub.iloc[:, -6:].copy()

In [None]:
# base_sub[['comment_id', 'score']].to_csv("base_submission.csv", index=False)
# base_sub2[['comment_id', 'score']].to_csv("base_submission2.csv", index=False)

# Ensemble of simple TF-Idf and Ridge regression

### Ensemble of TfIdf - Ridge models using data from 
- Toxic competition
- Toxic CLEANED competition
- Ruddit toxic data
- Toxic multilingual competition


In [None]:
def timer(func):
    """Decorator that prints how long each call to ``func`` took, in minutes.

    The wrapped function's return value is passed through unchanged. The
    elapsed time is printed only when ``func`` returns normally.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    from functools import wraps

    @wraps(func)  # keep func.__name__ / __doc__ on the decorated function
    def wrapper(*args, **kws):
        st = time.time()
        res = func(*args, **kws)
        et = time.time()
        tt = (et-st)/60
        print(f'Time taken is {tt:.2f} mins')
        return res
    return wrapper

# Training data 

## Convert the label to SUM of all toxic labels (This might help with maintaining toxicity order of comments)

In [None]:
df_test = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")
df_test_l = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv").replace(-1,0)
print(df_test.shape)
df_test = pd.merge(df_test, df_test_l, how="left", on = "id")
df_test.shape

In [None]:
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
print(df.shape)
df = pd.concat([df, df_test])
print(df.shape)
del df_test

for col in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    print(f'****** {col} *******')
    display(df.loc[df[col]==1,['comment_text',col]].sample(5))

In [None]:

# Give more weight to severe toxic 
df['severe_toxic'] = df.severe_toxic * 2
df['y'] = (df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) ).astype(int)
df['y'] = df['y']/df['y'].max()

df = df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})
df.sample(5)

In [None]:
df['y'].value_counts()

# Load validation data & filter for overlapping sentences

In [None]:
# Validation data 

df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
print(df_val.shape)


# Find cases already present in toxic data: a pair overlaps when either of its
# comments appears verbatim in the training text.
train_texts = set(df['text'])
seen_in_train = (df_val['less_toxic'].isin(train_texts)
                 | df_val['more_toxic'].isin(train_texts))

# Removing those cases. The previous merge-based filter kept the rows that
# matched (the opposite of what was intended) and could duplicate validation
# rows whenever a training text occurred more than once.
df_val = df_val.loc[~seen_in_train, ['worker', 'less_toxic', 'more_toxic']]
df_val.shape

## Create `n_folds` resampled versions of the TOXIC data

In [None]:
n_folds = 2

frac_1 = 0.7
frac_1_factor = 1.3

In [None]:
@timer
def create_folds():
    """Write ``n_folds`` resampled copies of the global ``df`` to CSV.

    Each fold holds a ``frac_1`` sample of the rows with y > 0 plus
    ``len(positives) * frac_1 * frac_1_factor`` rows with y == 0, shuffled.
    Sampling and shuffling for a fold all use the seed ``10 * (fold + 1)``.
    Files are written to /kaggle/working/df_fld<fold>.csv.
    """
    toxic_rows = df[df.y>0]
    clean_rows = df[df.y==0]
    n_clean = int(len(toxic_rows)*frac_1*frac_1_factor)

    for fld in range(n_folds):
        print(f'Fold: {fld}')
        seed = 10*(fld+1)

        parts = [toxic_rows.sample(frac=frac_1, random_state=seed),
                 clean_rows.sample(n=n_clean, random_state=seed)]
        # Shuffle so toxic and non-toxic rows are interleaved in the file
        tmp_df = pd.concat(parts, axis=0).sample(frac=1, random_state=seed)

        tmp_df.to_csv(f'/kaggle/working/df_fld{fld}.csv', index=False)
        print(tmp_df.shape)
        print(tmp_df['y'].value_counts())


create_folds()

# Create `n_folds` resampled versions of __clean__ TOXIC data

In [None]:
@timer
def clean(data, col):
    """Normalise punctuation and repeated characters in ``data[col]``.

    data - DataFrame holding the text; it is modified in place.
    col  - name of the string column to clean.

    Returns the same ``data`` object. Every pattern-based replacement passes
    ``regex=True`` explicitly: pandas >= 2.0 treats the pattern as a literal
    string by default, which would turn these steps into silent no-ops.
    """

    # Clean some punctutations: pad newlines with spaces (literal replacement)
    data[col] = data[col].str.replace('\n', ' \n ', regex=False)
    # Remove ip address (and any other run of 3+ dot-separated numbers)
    data[col] = data[col].str.replace(r'(([0-9]+\.){2,}[0-9]+)',' ', regex=True)
    # Put spaces around a single / ! ? . that sits between two words
    data[col] = data[col].str.replace(r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)',r'\1 \2 \3', regex=True)
    # Replace repeating characters more than 3 times to length of 3
    data[col] = data[col].str.replace(r'([*!?\'])\1\1{2,}',r'\1\1\1', regex=True)
    # patterns with repeating characters:
    # 3+ of the same letter at the end of a word -> 2 of that letter
    data[col] = data[col].str.replace(r'([a-zA-Z])\1{2,}\b',r'\1\1', regex=True)
    # 4+ of the same letter inside a word -> 3 of that letter
    data[col] = data[col].str.replace(r'([a-zA-Z])\1\1{2,}\B',r'\1\1\1', regex=True)
    # Collapse runs of spaces and trim both ends
    data[col] = data[col].str.replace(r'[ ]{2,}',' ', regex=True).str.strip()
    # Add space around repeating characters (runs of * ! ? ')
    data[col] = data[col].str.replace(r'([*!?\']+)',r' \1 ', regex=True)

    return data

In [None]:
# Test clean function
test_clean_df = pd.DataFrame({"text":
                              ["heyy\n\nkkdsfj",
                               "hi   how/are/you ???",
                               "hey?????",
                               "hey????? 18.98.333.20 18.98.",
                               "noooo!!!!!!!!!   comeone !! ",
                              "cooooooooool     brooooooooooo  coool brooo",
                              "naaaahhhhhhh"]})
display(test_clean_df)
clean(test_clean_df,'text')

In [None]:
df = clean(df,'text')

In [None]:

# Same resampling recipe as the raw folds, applied to the cleaned `df`:
# a frac_1 sample of the y > 0 rows plus frac_1 * frac_1_factor times as many
# y == 0 rows, shuffled, with seed 10 * (fold + 1) used throughout the fold.
for fld in range(n_folds):
    print(f'Fold: {fld}')
    seed = 10*(fld+1)

    toxic_part = df[df.y>0].sample(frac=frac_1, random_state=seed)
    clean_part = df[df.y==0].sample(n=int(len(df[df.y>0])*frac_1*frac_1_factor),
                                    random_state=seed)
    tmp_df = pd.concat([toxic_part, clean_part], axis=0).sample(frac=1, random_state=seed)

    tmp_df.to_csv(f'/kaggle/working/df_clean_fld{fld}.csv', index=False)
    print(tmp_df.shape)
    print(tmp_df['y'].value_counts())

In [None]:
del df,tmp_df
gc.collect()

## Read toxic Ruddit data

In [None]:
df_ = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
print(df_.shape)

df_ = df_[['txt', 'offensiveness_score']].rename(columns={'txt': 'text',
                                                                'offensiveness_score':'y'})

df_['y'] = (df_['y'] - df_.y.min()) / (df_.y.max() - df_.y.min()) 
df_.y.hist()

# Create `n_folds` resampled versions of RUDDIT data

In [None]:

for fld in range(n_folds):
    print(f'Fold: {fld}')
    tmp_df = df_.sample(frac=frac_1, random_state = 10*(fld+1))
    tmp_df.to_csv(f'/kaggle/working/df2_fld{fld}.csv', index=False)
    print(tmp_df.shape)
    print(tmp_df['y'].value_counts())

In [None]:
del tmp_df, df_; 
gc.collect()

## Read Jigsaw multilingual data CLEANED

In [None]:
dfm = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
print(dfm.shape)

dfm = clean(dfm,'comment_text')

for col in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    print(f'****** {col} *******')
    display(dfm.loc[dfm[col]==1,['comment_text',col]].sample(5))
    

# Give more weight to severe toxic 
dfm['severe_toxic'] = dfm.severe_toxic * 2
dfm['y'] = (dfm[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) ).astype(int)
dfm['y'] = dfm['y']/dfm['y'].max()

dfm = dfm[['comment_text', 'y']].rename(columns={'comment_text': 'text'})
dfm.y.value_counts()

# Create `n_folds` resampled versions of Multilingual data

In [None]:

# Resample the multilingual frame once per fold: a frac_1 sample of the y > 0
# rows plus frac_1 * frac_1_factor times as many y == 0 rows, shuffled, with
# seed 10 * (fold + 1) used for every draw in that fold.
for fld in range(n_folds):
    print(f'Fold: {fld}')
    seed = 10*(fld+1)

    toxic_part = dfm[dfm.y>0].sample(frac=frac_1, random_state=seed)
    clean_part = dfm[dfm.y==0].sample(n=int(len(dfm[dfm.y>0])*frac_1*frac_1_factor),
                                      random_state=seed)
    tmp_df = pd.concat([toxic_part, clean_part], axis=0).sample(frac=1, random_state=seed)

    tmp_df.to_csv(f'/kaggle/working/dfm_fld{fld}.csv', index=False)
    print(tmp_df.shape)
    print(tmp_df['y'].value_counts())

## Load Test data  


In [None]:
# Test data
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df_sub.shape

# Create Sklearn Pipeline with 
-  TFIDF - Take 'char_wb' as analyzer to capture subwords well
-  Ridge - Ridge is a simple regression algorithm that will reduce overfitting 

In [None]:
class LengthTransformer(BaseEstimator, TransformerMixin):
    """One-column feature: the length of each document, shifted and scaled.

    Every document ``x`` maps to ``(len(x) - 360) / 550``. The two constants
    are hard-coded here; presumably they approximate the centre and spread of
    comment lengths - TODO confirm against the training data.
    """

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from X.
        return self

    def transform(self, X):
        # Sparse (n_documents, 1) matrix so it can be stacked by FeatureUnion.
        return sparse.csr_matrix([[(len(x)-360)/550] for x in X])

    def get_feature_names(self):
        return ["lngth"]

    def get_feature_names_out(self, input_features=None):
        # scikit-learn >= 1.2 only consults this method when a FeatureUnion
        # or Pipeline is asked for its feature names.
        return np.asarray(self.get_feature_names(), dtype=object)

class LengthUpperTransformer(BaseEstimator, TransformerMixin):
    """One-column binary feature: 1 when over 75% of a document is uppercase.

    The share is the number of characters for which ``str.isupper()`` is true
    divided by the total character count. Empty documents yield 0.
    """

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from X.
        return self

    def transform(self, X):
        flags = []
        for doc in X:
            n_upper = sum(1 for ch in doc if ch.isupper())
            # Guard the division: an empty document used to raise
            # ZeroDivisionError; it now simply counts as "not uppercase".
            mostly_upper = len(doc) > 0 and n_upper/len(doc) > 0.75
            flags.append([int(mostly_upper)])
        # Sparse (n_documents, 1) matrix so it can be stacked by FeatureUnion.
        return sparse.csr_matrix(flags)

    def get_feature_names(self):
        return ["lngth_uppercase"]

    def get_feature_names_out(self, input_features=None):
        # scikit-learn >= 1.2 only consults this method when a FeatureUnion
        # or Pipeline is asked for its feature names.
        return np.asarray(self.get_feature_names(), dtype=object)

## Train pipeline

- Load folds data
- train pipeline
- Predict on validation data
- Predict on test data

# Training function

In [None]:
@timer
def train_pipeline(pipeline, data_path_name, n_folds, clean_prm = False):
    """Fit ``pipeline`` on each saved fold and score validation and test text.

    pipeline       - sklearn Pipeline with a 'features' step and a 'clf' step
                     (the 'clf' step must expose ``coef_``).
    data_path_name - file prefix; fold ``i`` is read from
                     /kaggle/working/<data_path_name>_fld<i>.csv and must have
                     'text' and 'y' columns.
    n_folds        - number of fold files to train on.
    clean_prm      - when True, the validation and test text is passed through
                     ``clean`` before prediction.

    Reads the global frames ``df_val`` (columns 'less_toxic', 'more_toxic') and
    ``df_sub`` (column 'text') and leaves both unmodified.

    Returns three arrays with one column per fold: predictions for
    ``df_val['less_toxic']``, for ``df_val['more_toxic']`` and for
    ``df_sub['text']``.
    """
    val_preds_arr1_tmp = np.zeros((df_val.shape[0], n_folds))
    val_preds_arr2_tmp = np.zeros((df_val.shape[0], n_folds))
    test_preds_arr_tmp = np.zeros((df_sub.shape[0], n_folds))

    # Prepare the inference text once, outside the fold loop. `clean` works in
    # place, so it is given copies: cleaning the global frames directly leaked
    # cleaned text into later clean_prm=False runs and re-cleaned the same
    # text on every fold.
    if clean_prm:
        less_toxic_txt = clean(df_val[['less_toxic']].copy(), 'less_toxic')['less_toxic']
        more_toxic_txt = clean(df_val[['more_toxic']].copy(), 'more_toxic')['more_toxic']
        test_txt = clean(df_sub[['text']].copy(), 'text')['text']
    else:
        less_toxic_txt = df_val['less_toxic']
        more_toxic_txt = df_val['more_toxic']
        test_txt = df_sub['text']

    for fld in range(n_folds):
        print("\n\n")
        print(f' ****************************** FOLD: {fld} ******************************')
        fold_df = pd.read_csv(f'/kaggle/working/{data_path_name}_fld{fld}.csv')
        print(fold_df.shape)

        print("\nTrain:")
        # Train the pipeline
        pipeline.fit(fold_df['text'], fold_df['y'])

        # What are the important features for toxicity
        # get_feature_names() was removed in scikit-learn 1.2; keep using it
        # where it exists and fall back to get_feature_names_out() otherwise.
        feature_step = pipeline['features']
        if hasattr(feature_step, 'get_feature_names'):
            feature_names = feature_step.get_feature_names()
        else:
            feature_names = feature_step.get_feature_names_out()

        print('\nTotal number of features:', len(feature_names) )

        # Features sorted by rounded coefficient, largest first
        feature_wts = sorted(list(zip(feature_names,
                                      np.round(pipeline['clf'].coef_,2) )),
                             key = lambda x:x[1],
                             reverse=True)

        display(pd.DataFrame(feature_wts[:50], columns = ['feat','val']).T)
        #.plot('feat','val',kind='barh',figsize = (8,8) )
        #plt.show()

        print("\npredict validation data ")
        val_preds_arr1_tmp[:,fld] = pipeline.predict(less_toxic_txt)
        val_preds_arr2_tmp[:,fld] = pipeline.predict(more_toxic_txt)

        print("\npredict test data ")
        test_preds_arr_tmp[:,fld] = pipeline.predict(test_txt)
    return val_preds_arr1_tmp, val_preds_arr2_tmp, test_preds_arr_tmp

# Toxic Training

In [None]:
features = FeatureUnion([
    #('vect1', LengthTransformer()),
    #('vect2', LengthUpperTransformer()),
    ("vect3", TfidfVectorizer(min_df= 3, max_df=0.5, 
                              analyzer = 'char_wb', ngram_range = (3,5))),
    #("vect4", TfidfVectorizer(min_df= 5, max_df=0.5, analyzer = 'word', token_pattern=r'(?u)\b\w{8,}\b')),

])
pipeline = Pipeline(
    [
        ("features", features),
        #("clf", RandomForestRegressor(n_estimators = 5, min_sample_leaf=3)),
        ("clf", Ridge()),
        #("clf",LinearRegression())
    ]
)

val_preds_arr1, val_preds_arr2, test_preds_arr = train_pipeline(pipeline, 
                                                                "df", 
                                                                n_folds,
                                                                clean_prm=False)


# Toxic __clean__ Training

In [None]:
features = FeatureUnion([
    #('vect2', LengthUpperTransformer()),
    ("vect3", TfidfVectorizer(min_df= 3, max_df=0.5, 
                              analyzer = 'char_wb', ngram_range = (3,5))),

])
pipeline = Pipeline(
    [
        ("features", features),
        #("clf", RandomForestRegressor(n_estimators = 5, min_sample_leaf=3)),
        ("clf", Ridge()),
    ]
)

val_preds_arr1c, val_preds_arr2c, test_preds_arrc = train_pipeline(pipeline, 
                                                                   "df_clean", 
                                                                   n_folds,
                                                                   clean_prm=True)


## Ruddit data Training

In [None]:
features = FeatureUnion([
    #('vect2', LengthUpperTransformer()),
    ("vect3", TfidfVectorizer(min_df= 3, max_df=0.5, 
                              analyzer = 'char_wb', ngram_range = (3,5))),

])
pipeline = Pipeline(
    [
        ("features", features),
        #("clf", RandomForestRegressor(n_estimators = 5, min_sample_leaf=3)),
        ("clf", Ridge()),
    ]
)

val_preds_arr1_, val_preds_arr2_, test_preds_arr_ = train_pipeline(pipeline, 
                                                                   "df2", 
                                                                   n_folds,
                                                                   clean_prm=False)


## Multilingual data Training

In [None]:
features = FeatureUnion([
    #('vect1', LengthTransformer()),
    #('vect2', LengthUpperTransformer()),
    ("vect3", TfidfVectorizer(min_df= 3, max_df=0.5, 
                              analyzer = 'char_wb', ngram_range = (3,5))),
    #("vect4", CountVectorizer(min_df= 5, max_df=0.3, analyzer = 'word', ngram_range = (2,3), token_pattern=r'(?u)\b\w{3,}\b', binary=True))
])
pipeline = Pipeline(
    [
        ("features", features),
        #("clf", RandomForestRegressor(n_estimators = 5, min_sample_leaf=3)),
        ("clf", Ridge()),
        #("clf",LinearRegression())
    ]
)

val_preds_arr1m, val_preds_arr2m, test_preds_arrm = train_pipeline(pipeline, 
                                                                   "dfm", 
                                                                    n_folds,
                                                                    clean_prm=True)

# Validate the pipeline 

In [None]:
print(" Toxic data ")
p1 = val_preds_arr1.mean(axis=1)
p2 = val_preds_arr2.mean(axis=1)

print(f'Validation Accuracy is { np.round((p1 < p2).mean() * 100,2)}')

print(" Ruddit data ")
p3 = val_preds_arr1_.mean(axis=1)
p4 = val_preds_arr2_.mean(axis=1)

print(f'Validation Accuracy is { np.round((p3 < p4).mean() * 100,2)}')

print(" Toxic CLEAN data ")
p5 = val_preds_arr1c.mean(axis=1)
p6 = val_preds_arr2c.mean(axis=1)

print(f'Validation Accuracy is { np.round((p5 < p6).mean() * 100,2)}')

print(" Toxic Mulitlingual data ")
p7 = val_preds_arr1m.mean(axis=1)
p8 = val_preds_arr2m.mean(axis=1)

print(f'Validation Accuracy is { np.round((p7 < p8).mean() * 100,2)}')

## Optimize the model weights for ensemble

In [None]:
@timer
def optimize_wts():
    """Grid-search the four ensemble weights that maximise validation accuracy.

    Uses the global per-dataset validation predictions: p1/p3/p5/p7 for the
    "less toxic" side and p2/p4/p6/p8 for the "more toxic" side. Returns the
    full output of ``scipy.optimize.brute`` (best grid point first, then the
    objective value there).
    """
    def neg_accuracy(x):
        # brute() minimises, so return the negated share of correctly ordered pairs
        less_score = x[0]*p1 + x[1]*p3 + x[2]*p5 + x[3]*p7
        more_score = x[0]*p2 + x[1]*p4 + x[2]*p6  + x[3]*p8
        return -1*((less_score < more_score).mean())

    # One slice per weight: the first weight is searched over [0.20, 0.6),
    # the other three over [0.05, 0.5), all in steps of 0.015.
    first_range = slice(0.20, 0.6, 0.015)
    rranges = (first_range,) + tuple(slice(0.05, 0.5, 0.015) for _ in range(3))

    # finish=None: return the best grid point itself, without a polishing step
    return optimize.brute(neg_accuracy,
                          rranges,
                          full_output=True,
                          finish=None)

resbrute = optimize_wts()

print(resbrute[0])  # best weights found on the grid
print(resbrute[1]*-1)  # validation accuracy at those weights

In [None]:
w1,w2,w3,w4 = resbrute[0]
# print(best_wts)

p1_wt = w1*p1 + w2*p3 + w3*p5 + w4*p7
p2_wt = w1*p2 + w2*p4 + w3*p6 + w4*p8

## Analyze bad predictions 

In [None]:
# Mega ridge
df_val['p1'] = p1_wt
df_val['p2'] = p2_wt
df_val['diff'] = np.abs(p2_wt - p1_wt)

df_val['correct'] = (p1_wt < p2_wt).astype('int')

(p1_wt < p2_wt).mean()

In [None]:
### Incorrect predictions with similar scores

df_val[(df_val.correct == 0) & (df_val.p1 < 0.5*df_val.p1.max())].sort_values('diff', ascending=True).head(2)

In [None]:
df_val[(df_val.correct == 0) & (df_val.p1 > 0.5*df_val.p1.max())].sort_values('diff', ascending=True).head(2)

#### Some of these just look incorrectly tagged 


In [None]:
### Incorrect predictions with dis-similar scores

df_val[df_val.correct == 0].sort_values('diff', ascending=False).head(2)

In [None]:
df_val[(df_val.correct == 0) & (df_val['diff'] < 0.4*df_val['diff'].max())].sort_values('diff', ascending=False).head(2)

# Predict on test data 

In [None]:
# Predict using pipeline
df_sub['score'] = w1*test_preds_arr.mean(axis=1) + \
                  w2*test_preds_arr_.mean(axis=1) + \
                  w3*test_preds_arrc.mean(axis=1) + \
                  w4*test_preds_arrm.mean(axis=1)

## Correct the rank ordering

In [None]:
base_scores['MegaRidge'] = df_sub['score']
cols = base_scores.columns.to_list()

base_scores.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

base_scores2 = base_scores.copy()

for col in cols:
    sc = MinMaxScaler()
    base_scores2[col] = sc.fit_transform(base_scores2[col].values.reshape(-1,1))

In [None]:
# df_sub['score'] = pd.DataFrame([base_scores.rank(method='average')[c] * weights[c] for c in cols]).T.sum(axis=1).rank(method='average')
# df_sub['score'] = base_scores.rank(method='average').mean(axis=1)

weights1 = {
    'Ridge2.5': 0.1,
    'PassiveAggressive': 0.15,
    'SVR': 0.15,
    'Ridge1': 0.05,
    'Ridge2': 0.05,
    'Ridge10': 0.05,
    'MegaRidge': 0.45
}

weights2 = {
    'Ridge2.5': 0.05,
    'PassiveAggressive': 0.2,
    'SVR': 0.2,
    'Ridge1': 0.05,
    'Ridge2': 0.05,
    'Ridge10': 0.05,
    'MegaRidge': 0.4
}

# df_sub['score'] = pd.DataFrame([base_scores2[c] * weights1[c] for c in cols]).T.sum(axis=1).rank(method='average')
df_sub['score'] = base_scores2.mean(axis=1).rank(method='first')
# df_sub['score'] = pd.DataFrame([base_scores2[c] * weights2[c] for c in cols]).T.sum(axis=1)

In [None]:
# Cases with duplicates scores

df_sub['score'].count() - df_sub['score'].nunique()

In [None]:
print(df_sub.duplicated('score').value_counts())

In [None]:
same_score = df_sub['score'].value_counts().reset_index()[:14]
same_score

In [None]:
# Select the score values by position: after value_counts().reset_index() the
# first column holds the values, but it is named 'index' on pandas < 2.0 and
# 'score' on pandas >= 2.0, so looking it up by name is version-dependent.
df_sub[df_sub['score'].isin(same_score.iloc[:, 0].tolist())]

In [None]:
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)

# Create average solution

In [None]:
# sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
# sub['score'] = 0.5*(0.5*base_sub['score'] + 0.5*base_sub2['score']) + 0.5*df_sub['score']
# sub[['comment_id', 'score']].to_csv("submission.csv", index=False)