LOADING THE TRAINING AND TESTING DATA

In [None]:
import ast
import pandas as pd

from google.colab import drive
drive.mount("/content/gdrive")

import numpy as np

import sklearn
from sklearn.model_selection import train_test_split

In [None]:
trainDataDf = pd.read_csv("/content/gdrive/MyDrive/project_nlp/data/Hate_Offensive_Language_Identification_train.csv")

In [None]:
testDataDf = pd.read_csv("/content/gdrive/MyDrive/project_nlp/data/Hate_Offensive_Language_Identification_test.csv")

In [None]:
trainTweets = list(trainDataDf.iloc[:, 0])

In [None]:
trainTweetsLabels = list(trainDataDf.iloc[:, 1])

In [None]:
testTweets = list(testDataDf.iloc[:, 0])

FORMING THE Y_TRAIN

In [None]:
# Encode the string labels as integers: 'NOT' -> 1, every other label -> 0.
y_train = np.asarray([1 if label == 'NOT' else 0 for label in trainTweetsLabels])

TEXT PREPROCESSING

In [None]:
!pip install emoji
!pip install autocorrect
!pip install googletrans==3.1.0a0

In [None]:
import re
import numpy as np
import emoji
import string
characters = string.punctuation

from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
from autocorrect import Speller
spell = Speller()
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
def userid(tweet):
    """Return how many whitespace-separated tokens of ``tweet`` start with '@'."""
    return sum(1 for token in tweet.split() if token[0] == '@')

In [None]:
def _load_profanity_list():
    """Read the Hinglish profanity CSV once and memoise the frame on this function."""
    cached = getattr(_load_profanity_list, '_cache', None)
    if cached is None:
        cached = pd.read_csv('/content/gdrive/MyDrive/project_nlp/data/Hinglish_Profanity_List.csv', engine='python', header=None, encoding='cp1252')
        cached.columns = ['Hinglish', 'English', 'Level']
        _load_profanity_list._cache = cached
    return cached


def profanity_vector(tweet, bad_words=None):
    """Build a profanity feature vector for one tweet.

    The vector has one slot per row of the profanity list. A slot holds that
    row's ``Level`` when the row's English (checked first) or Hinglish spelling
    appears as a whitespace-separated token of ``tweet``; otherwise it stays 0.

    Parameters
    ----------
    tweet : str
        Text to scan; it is split on whitespace.
    bad_words : pandas.DataFrame, optional
        Frame with ``'Hinglish'``, ``'English'`` and ``'Level'`` columns. When
        omitted, the project CSV is loaded on first use and reused afterwards
        instead of being re-read from disk for every tweet.

    Returns
    -------
    list
        Vector of length ``len(bad_words)``.
    """
    if bad_words is None:
        bad_words = _load_profanity_list()
    english = bad_words['English'].values
    hinglish = bad_words['Hinglish'].values
    level = bad_words['Level'].values

    PV = [0] * len(level)
    for word in tweet.split():
        # English spellings take precedence over Hinglish ones; the first
        # matching row wins when a spelling is listed more than once.
        if word in english:
            row = np.where(english == word)[0][0]
            PV[row] = level[row]
        elif word in hinglish:
            row = np.where(hinglish == word)[0][0]
            PV[row] = level[row]
    return PV

In [None]:
def stopword(tweet):
    """Return ``tweet`` with English stop words removed.

    The tweet is tokenised with ``word_tokenize`` and the surviving tokens are
    joined with single spaces.
    """
    # Use a distinct local name: assigning to `stopwords` inside the function
    # would shadow the imported NLTK corpus and raise UnboundLocalError.
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(token for token in word_tokenize(tweet) if token not in english_stopwords)

In [None]:
def translation(tweet):
    """Translate ``tweet`` with googletrans (``translate()`` called with its defaults) and lower-case the result."""
    from googletrans import Translator
    return Translator(service_urls=['translate.googleapis.com']).translate(tweet).text.lower()

In [None]:
def more_cleaning(tweet):
    """Spell-correct and lemmatise the tokens of ``tweet``.

    Tokens that are found in the module-level ``characters`` string
    (``string.punctuation``) are dropped; the rest are passed through
    ``spell`` and then ``lemmatizer.lemmatize`` and joined with spaces.
    """
    cleaned = [
        lemmatizer.lemmatize(spell(tok))
        for tok in word_tokenize(tweet)
        if tok not in characters
    ]
    return ' '.join(cleaned)

In [None]:
def text_preprocessing(data):
    """Clean, translate and featurise an iterable of raw tweets.

    For each tweet the function:
      1. counts the '@'-prefixed tokens of the raw text (``userid``);
      2. strips URLs, converts emojis to their text names, replaces literal
         backslash-n sequences with spaces and removes standalone RT/rt markers;
      3. translates the text (``translation``) and tokenises it;
      4. drops '@'-prefixed and punctuation tokens, lower-cases the rest,
         blanks out hashtag tokens, and strips remaining non-word characters;
      5. computes the profanity vector (``profanity_vector``) and the
         spell-corrected, lemmatised text (``more_cleaning``).

    Parameters
    ----------
    data : iterable of str
        Raw tweets.

    Returns
    -------
    tuple of numpy.ndarray
        ``(user_ids, prof_vector, clean_translated_data)`` where ``user_ids``
        is reshaped to a single column and the other two hold one entry per tweet.
    """
    # Raw string keeps the pattern byte-identical while avoiding Python's
    # invalid-escape-sequence warnings for \. \w \s etc.
    url_pattern = re.compile(r"((https?://|www\.)([\w-]+\.){1,}[a-zA-Z]+(\/([\w\~\-]|\.(?!\s))*)*)|(([\w-]+\.){1,}(com|net|org|io|gov)(\/([\w\~\-]|\.(?!\s))*)*)")

    user_ids = []
    clean_translated_data = []
    prof_vector = []

    for tweet in tqdm(data):
        userids = userid(tweet)
        tweet = url_pattern.sub("", tweet)
        tweet = emoji.demojize(tweet)
        tweet = re.sub(r'\\n', '  ', tweet)  # literal backslash-n sequences -> spaces
        # Word boundaries: only the retweet marker is removed, not the letters
        # "rt" inside ordinary words such as "party" or "heart".
        tweet = re.sub(r'\b(?:RT|rt)\b', '', tweet)
        translated_tweet = translation(tweet)

        clean_text = []
        for word in word_tokenize(translated_tweet):
            # '@'-prefixed tokens and punctuation tokens are dropped.
            # NOTE(review): the original computed a 'username' placeholder for
            # '@' tokens but never appended it; confirm whether mentions should
            # be kept as 'username' instead of being discarded.
            if word[0] == '@' or word in characters:
                continue
            # Hashtag-like tokens (#Modi, #Hindu) are replaced by a space.
            clean_text.append(re.sub(r'^[#@]\w+', ' ', word.lower()))

        clean_text = ' '.join(clean_text)
        clean_text = re.sub(r'[^\w+\s+]', '', clean_text)

        user_ids.append(userids)
        prof_vector.append(profanity_vector(clean_text))
        clean_translated_data.append(more_cleaning(clean_text))

    user_ids = np.asarray(user_ids).reshape(-1, 1)
    prof_vector = np.asarray(prof_vector)
    clean_translated_data = np.asarray(clean_translated_data)

    return user_ids, prof_vector, clean_translated_data

In [None]:
user_ids_train, prof_vector_train, processed_translated_train_data = text_preprocessing(trainTweets)

In [None]:
user_ids_test, prof_vector_test, processed_translated_test_data  = text_preprocessing(testTweets)

In [None]:
# Persist the preprocessed training features so the expensive translation step
# does not have to be repeated. The file is written to the same Drive location
# that the "LOADING THE DATA SAVED IN PREPROCESSING" section reads from.
dataframe_processed_train = pd.DataFrame(
    list(zip(trainTweets, user_ids_train.tolist(), prof_vector_train.tolist(), processed_translated_train_data.tolist(), trainTweetsLabels)),
    columns=['RawTweet', 'user_ids', 'prof_vector', 'Translated_Data_Clean', 'Label'])
dataframe_processed_train.to_csv("/content/gdrive/MyDrive/project_nlp/data/processed_train_data_inserted_all_listm.csv")

In [None]:
# Persist the preprocessed TEST features. The test tweets must be paired with
# the test-set user-id counts and profanity vectors (not the training ones).
# The file is written to the same Drive location that the reload cell reads from.
dataframe_processed_test = pd.DataFrame(
    list(zip(testTweets, user_ids_test.tolist(), prof_vector_test.tolist(), processed_translated_test_data.tolist())),
    columns=['RawTweet', 'user_ids', 'prof_vector', 'Translated_Data_Clean'])
dataframe_processed_test.to_csv("/content/gdrive/MyDrive/project_nlp/data/processed_test_data_inserted_all_listm.csv")

LOADING THE DATA SAVED IN PREPROCESSING

For Train

In [None]:
dataframe_processed_train = pd.read_csv("/content/gdrive/MyDrive/project_nlp/data/processed_train_data_inserted_all_listm.csv")

For test

In [None]:
dataframe_processed_test = pd.read_csv("/content/gdrive/MyDrive/project_nlp/data/processed_test_data_inserted_all_listm.csv")

SPLITTING THE DATASET ONLY FOR THE CASE OF EVALUATION

In [None]:
# Hold out 25% of the rows (fixed seed 42) as a validation set.
# NOTE: this rebinds `dataframe_processed_train` and `y_train` to the remaining
# 75%, so every later cell that uses those names sees only the reduced training
# portion, and re-running this cell without reloading splits the data again.
dataframe_processed_train, dataframe_processed_val, y_train, y_val = train_test_split(dataframe_processed_train, y_train, test_size=0.25, random_state=42)

Extracting the features from the Loaded Data

FOR TRAIN

In [None]:
processed_translated_train_data = dataframe_processed_train['Translated_Data_Clean']
processed_translated_train_data = np.asarray(list(processed_translated_train_data))

In [None]:
user_ids_train = dataframe_processed_train['user_ids']
user_ids_train = np.asarray(list(user_ids_train.apply(ast.literal_eval)))

In [None]:
prof_vector_train = dataframe_processed_train['prof_vector']
prof_vector_train = np.asarray(list(prof_vector_train.apply(ast.literal_eval)))

FOR VAL

In [None]:
processed_translated_val_data = dataframe_processed_val['Translated_Data_Clean']
processed_translated_val_data = np.asarray(list(processed_translated_val_data))

In [None]:
user_ids_val = dataframe_processed_val['user_ids']
user_ids_val = np.asarray(list(user_ids_val.apply(ast.literal_eval)))

In [None]:
prof_vector_val = dataframe_processed_val['prof_vector']
prof_vector_val = np.asarray(list(prof_vector_val.apply(ast.literal_eval)))

FOR TEST

In [None]:
processed_translated_test_data = dataframe_processed_test['Translated_Data_Clean']
processed_translated_test_data = np.asarray(list(processed_translated_test_data))

In [None]:
user_ids_test = dataframe_processed_test['user_ids']
user_ids_test = np.asarray(list(user_ids_test.apply(ast.literal_eval)))

In [None]:
prof_vector_test = dataframe_processed_test['prof_vector']
prof_vector_test = np.asarray(list(prof_vector_test.apply(ast.literal_eval)))

In [None]:
t = pd.Series(y_train)

In [None]:
t.value_counts()

1    4064
0    3529
dtype: int64

FEATURE FORMATION or EXTRACTION

FINDING THE SENTENCE EMBEDDINGS

Universal Sentence Encoder (USE)

In [None]:
!pip3 install --upgrade tensorflow-gpu
# Install TF-Hub.
!pip3 install tensorflow-hub



In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

2022-12-02 11:22:54.737010: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-02 11:22:54.874217: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-02 11:22:54.874248: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-02 11:22:55.742811: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

In [None]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" 
model = hub.load(module_url)
print ("module %s loaded" % module_url)

2022-12-02 11:22:57.385303: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-12-02 11:22:57.385347: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-02 11:22:57.385372: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jupyter-wsp-2d41n6mb44o5-5fuidm2e2e29meu): /proc/driver/nvidia/version does not exist
2022-12-02 11:22:57.385585: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [None]:
X_train1 = model(processed_translated_train_data)
X_val1 = model(processed_translated_val_data)

BERT based

In [None]:
!pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer, SentencesDataset, losses, models
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
def form_data(data_X, data_Y):
    """Wrap each (text, label) pair in an ``InputExample`` and return a shuffled ``DataLoader`` with batch size 25.

    ``data_X`` must expose ``.shape[0]``; ``data_Y`` is indexed in parallel.
    """
    examples = [
        InputExample(texts=[data_X[idx]], label=data_Y[idx])
        for idx in range(data_X.shape[0])
    ]
    return DataLoader(examples, shuffle=True, batch_size=25)

def get_model_predicts(data_type, trained_model):
    """Score every sentence pair by the cosine similarity of its two embeddings.

    Parameters
    ----------
    data_type : sequence
        Items whose elements ``[0]`` and ``[1]`` are the two texts to compare.
    trained_model : object
        Anything with an ``encode(text)`` method returning a 1-D vector.

    Returns
    -------
    list of numpy.ndarray
        One ``(1, 1)`` array per pair (the shape a pairwise cosine-similarity
        call yields for two single-row inputs). Empty input gives ``[]``.
    """
    score_samples = []
    for sample in data_type:
        first = np.asarray(trained_model.encode(sample[0]), dtype=float).ravel()
        second = np.asarray(trained_model.encode(sample[1]), dtype=float).ravel()
        # A zero vector has no direction; treat its norm as 1 so the score is 0
        # instead of dividing by zero.
        denom = (np.linalg.norm(first) or 1.0) * (np.linalg.norm(second) or 1.0)
        # The score is appended inside the loop so every pair is scored, not
        # only the last one.
        score_samples.append(np.array([[np.dot(first, second) / denom]]))
    return score_samples

In [None]:
def change_param_req_grad(model, makeChange, noOflayersToChange):
    """Freeze all but the last ``noOflayersToChange`` entries of ``model.parameters()``.

    Nothing happens when ``makeChange`` is falsy. Otherwise the leading
    parameters, in iteration order, get ``requires_grad = False``; the
    trailing ``noOflayersToChange`` ones are left untouched.
    """
    if not makeChange:
        return

    total_params = len(list(model.parameters()))
    freeze_count = total_params - noOflayersToChange

    params = model.parameters()
    frozen = 0
    while frozen < freeze_count:
        next(params).requires_grad = False
        frozen += 1

In [None]:
# dataloader
# Shuffled DataLoader of InputExample(text, label) pairs built from the training tweets.
dataloader_train = form_data(processed_translated_train_data, y_train)

# Pre-trained transformer backbone; change_param_req_grad(..., True, 1) sets
# requires_grad=False on every parameter except the last one yielded by parameters().
base_model = models.Transformer('sentence-transformers/bert-base-nli-mean-tokens')
change_param_req_grad(base_model, True, 1)

# Pooling layer sized to the backbone's word-embedding dimension
# (library-default pooling mode is used -- TODO confirm it is the intended one).
layer_ppoling = models.Pooling(base_model.get_word_embedding_dimension())

# Dense projection from the pooled sentence embedding to 200 features with a Tanh activation.
layer_dense = models.Dense(in_features=layer_ppoling.get_sentence_embedding_dimension(), out_features=200, activation_function=nn.Tanh())

# Full sentence encoder: transformer -> pooling -> dense.
modelSbert = SentenceTransformer(modules=[base_model, layer_ppoling, layer_dense])

# Batch-all triplet loss computed from the class labels attached to each example.
loss = losses.BatchAllTripletLoss(model=modelSbert)

# Fine-tune for a single epoch with 100 warm-up steps.
modelSbert.fit(train_objectives=[(dataloader_train, loss)], epochs=1, warmup_steps=100, show_progress_bar=True)

2022-12-02 17:28:49.755839: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/304 [00:00<?, ?it/s]

For Validation

In [None]:
# Encode the train and validation sentences with the fine-tuned SBERT model.
# encode() accepts a list of sentences and batches them internally, which is
# much faster than issuing one encode() call per sentence.
X_train2 = np.asarray(modelSbert.encode(processed_translated_train_data.tolist()))
X_val2 = np.asarray(modelSbert.encode(processed_translated_val_data.tolist()))

For test

In [None]:
# Encode the train and test sentences with the fine-tuned SBERT model.
# X_train2 is recomputed here so it holds raw sentence embeddings again.
# encode() accepts a list of sentences and batches them internally, which is
# much faster than issuing one encode() call per sentence.
X_train2 = np.asarray(modelSbert.encode(processed_translated_train_data.tolist()))
X_test2 = np.asarray(modelSbert.encode(processed_translated_test_data.tolist()))

XLNet based

In [None]:
!pip install transformers

In [None]:
from transformers import XLNetTokenizer, XLNetModel

In [None]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')

In [None]:
import torch


def _xlnet_first_token_embeddings(texts):
    """Return one row per text: the XLNet last-hidden-state vector of the first token.

    Inference runs under ``torch.no_grad()`` so no autograd graph is kept for
    each forward pass.
    """
    rows = []
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(text, return_tensors="pt")
            outputs = model(**inputs)
            # NOTE(review): index [0][0][0] selects the FIRST token of the
            # sequence; the XLNet tokenizer is documented to append its
            # <sep>/<cls> tokens at the END -- confirm the first token is the
            # intended sentence representation.
            rows.append(outputs[0][0][0].detach().numpy().tolist())
    return np.array(rows)


X_train3 = _xlnet_first_token_embeddings(processed_translated_train_data)
X_val3 = _xlnet_first_token_embeddings(processed_translated_val_data)

In [None]:
inputs = tokenizer("hello how r you", return_tensors="pt")
outputs = model(**inputs)

In [None]:
outputs

XLNetModelOutput(last_hidden_state=tensor([[[ 1.4035, -2.3553, -0.9075,  ..., -2.9300, -0.3122, -0.5953],
         [ 2.2118, -0.9352, -1.5000,  ..., -1.8375,  1.9391, -0.3130],
         [ 1.7653, -2.0395, -2.1579,  ...,  0.3441,  1.8899,  0.3688],
         ...,
         [ 3.2499, -1.6211, -1.4987,  ..., -1.1418,  1.4020, -0.8341],
         [ 4.2526,  0.1650, -2.8490,  ..., -2.3027, -0.5520, -0.3068],
         [ 3.9433,  0.6612, -2.4114,  ..., -2.1803, -1.1960,  0.1435]]],
       grad_fn=<PermuteBackward0>), mems=(tensor([[[-6.7457e-03, -6.4085e-02,  6.9629e-02,  ..., -1.2237e-01,
          -2.1938e-02, -6.5539e-05]],

        [[-6.8064e-02,  1.3153e-02, -3.3215e-02,  ..., -4.8097e-02,
           3.6501e-02, -4.7987e-02]],

        [[-3.8695e-02, -9.8694e-03, -5.3354e-03,  ...,  6.4287e-02,
           1.9249e-02, -1.4396e-02]],

        ...,

        [[-1.9789e-03,  2.6312e-02, -4.1397e-02,  ..., -5.2209e-02,
          -5.3612e-04, -4.4180e-02]],

        [[ 7.8792e-02, -5.8267e-02, -9.

COMBINING THE FEATURES I.E. EMBEDDINGS + SENTIMENT ETC.

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
def get_sentiment(tweet):
    '''Return the VADER sentiment scores of ``tweet``.

    The result is a 4-tuple ``(neg, neu, pos, comp)`` in which every element is
    a single-item list holding the negative, neutral, positive and compound
    score respectively (the shape ``feature_combining`` stacks into columns).
    '''
    # Build the analyzer once and reuse it; constructing a new
    # SentimentIntensityAnalyzer for every tweet repeats its setup needlessly.
    analyzer = getattr(get_sentiment, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        get_sentiment._analyzer = analyzer

    sentiment_score = analyzer.polarity_scores(tweet)
    return (
        [sentiment_score['neg']],
        [sentiment_score['neu']],
        [sentiment_score['pos']],
        [sentiment_score['compound']],
    )

In [None]:
def feature_combining(processed_data_train, processed_data_test, userids_train, userids_test, PV_train, PV_test, train_embeddings, test_embeddings):
    """Stack all per-tweet features into one matrix per split.

    For each split the columns are, in order: user-id counts, profanity vector,
    negative / neutral / positive / compound sentiment scores (from
    ``get_sentiment``), and the sentence embeddings.

    Returns ``(train_dataset, test_dataset)``.
    """

    def _sentiment_columns(tweets):
        # One get_sentiment() call per tweet, then regroup by score type.
        per_tweet = [get_sentiment(tweet) for tweet in tweets]
        return tuple(
            np.asarray([scores[kind] for scores in per_tweet])
            for kind in range(4)
        )

    neg_train, neu_train, pos_train, comp_train = _sentiment_columns(processed_data_train)
    neg_test, neu_test, pos_test, comp_test = _sentiment_columns(processed_data_test)

    train_dataset = np.hstack((userids_train, PV_train, neg_train, neu_train, pos_train, comp_train, train_embeddings))
    test_dataset = np.hstack((userids_test, PV_test, neg_test, neu_test, pos_test, comp_test, test_embeddings))
    return train_dataset, test_dataset

CASE OF EVALUATION

In [None]:
X_train2, X_val2 = feature_combining(processed_translated_train_data, processed_translated_val_data, user_ids_train, user_ids_val, prof_vector_train, prof_vector_val, X_train2, X_val2)

FORMING THE CLASSIFIER

MLP CLASSIFIER

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
def mlpClassifier(X_train, y_train, X_test):
    """Fit an ``MLPClassifier`` (random_state=1, max_iter=300) on the training data and return its predictions for ``X_test``."""
    mlp_clf = MLPClassifier(random_state=1, max_iter=300)
    mlp_clf.fit(X_train, y_train)
    return mlp_clf.predict(X_test)

RFC

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
def rfcClassifier(X_train, y_train, X_test):
    """Fit a ``RandomForestClassifier`` (random_state=42) and return its predictions for ``X_test``.

    A ``StandardScaler`` is fitted on ``X_train`` and the SAME fitted scaler is
    applied to ``X_test`` before predicting, so both splits pass through an
    identical transformation (previously only the training data was scaled).
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler().fit(X_train)
    rfc_clf = RandomForestClassifier(random_state=42)
    rfc_clf.fit(scaler.transform(X_train), y_train)
    return rfc_clf.predict(scaler.transform(X_test))

SVM

In [None]:
from sklearn import svm

In [None]:
def svmClassifier(X_train, y_train, X_test):
    """Fit an ``svm.SVC`` (random_state=42) and return its predictions for ``X_test``.

    A ``StandardScaler`` is fitted on ``X_train`` and the SAME fitted scaler is
    applied to ``X_test`` before predicting, so both splits pass through an
    identical transformation (previously only the training data was scaled).
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler().fit(X_train)
    svm_clf = svm.SVC(random_state=42)
    svm_clf.fit(scaler.transform(X_train), y_train)
    return svm_clf.predict(scaler.transform(X_test))

XGBClassifier

In [None]:
from xgboost import XGBClassifier

In [None]:
def xgbClassifier(X_train, y_train, X_test):
    """Fit an ``XGBClassifier`` (random_state=42) on the training data and return its predictions for ``X_test``."""
    booster = XGBClassifier(random_state=42)
    booster.fit(X_train, y_train)
    return booster.predict(X_test)

CALCULATING METRICS

In [None]:
from sklearn.metrics import f1_score

In [None]:
def calF1Score(predictions, actual):
    """Return the macro-averaged F1 score of ``predictions`` against the ground-truth ``actual`` labels."""
    return f1_score(y_true=actual, y_pred=predictions, average='macro')

CALLING THE CLASSIFIERS

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
def standardize(data):
    """Fit a fresh ``StandardScaler`` on ``data`` and return the transformed copy."""
    return StandardScaler().fit_transform(data)

STANDARDIZE DATA

IN CASE OF EVALUATION

When Trying out all the Embeddings

In [None]:
# Standardise each embedding family. The scaler is fitted on the training split
# only and then applied to the validation split, so both are transformed with
# the same mean/variance. (The original assigned the first result to
# `x_train1`, leaving `X_train1` unscaled.)
scaler1 = StandardScaler().fit(X_train1)
X_train1, X_val1 = scaler1.transform(X_train1), scaler1.transform(X_val1)

scaler2 = StandardScaler().fit(X_train2)
X_train2, X_val2 = scaler2.transform(X_train2), scaler2.transform(X_val2)

scaler3 = StandardScaler().fit(X_train3)
X_train3, X_val3 = scaler3.transform(X_train3), scaler3.transform(X_val3)

When Trying for Bert Based Embedding only

In [None]:
# Fit the scaler on the training split only and reuse it for the validation
# split, so both are transformed with the same mean/variance.
scaler2 = StandardScaler().fit(X_train2)
X_train2, X_val2 = scaler2.transform(X_train2), scaler2.transform(X_val2)

mlp calling

In [None]:
predictions1 = mlpClassifier(X_train1, y_train, X_val1)
calF1Score(predictions1, y_val)

In [None]:
predictions2 = mlpClassifier(X_train2, y_train, X_val2)
calF1Score(predictions2, y_val)

0.750417515300523

In [None]:
predictions2

array([0, 1, 1, ..., 0, 1, 1])

In [None]:
predictions3 = mlpClassifier(X_train3, y_train, X_val3)
calF1Score(predictions3, y_val)

0.7264556515369428

RFC CALLING

In [None]:
predictions1 = rfcClassifier(X_train1, y_train, X_val1)
calF1Score(predictions1, y_val)

0.7297754633326382

In [None]:
predictions2 = rfcClassifier(X_train2, y_train, X_val2)
calF1Score(predictions2, y_val)

0.7299002834871859

In [None]:
predictions3 = rfcClassifier(X_train3, y_train, X_val3)
calF1Score(predictions3, y_val)

0.6819504245671364

svm calling

In [None]:
predictions1 = svmClassifier(X_train1, y_train, X_val1)
calF1Score(predictions1, y_val)

0.7626520016683495

In [None]:
predictions2 = svmClassifier(X_train2, y_train, X_val2)
calF1Score(predictions2, y_val)

0.7730263600899618

In [None]:
predictions3 = svmClassifier(X_train3, y_train, X_val3)
calF1Score(predictions3, y_val)

0.7472440574989823

XGB CALLING

In [None]:
predictions1 = xgbClassifier(X_train1, y_train, X_val1)
calF1Score(predictions1, y_val)

In [None]:
predictions2 = xgbClassifier(X_train2, y_train, X_val2)
calF1Score(predictions2, y_val)

0.7400556217274648

In [None]:
predictions3 = xgbClassifier(X_train3, y_train, X_val3)
calF1Score(predictions3, y_val)

GENERATING FINAL OUTPUT

We performed multiple experiments and found that the BERT-based embeddings outperform the others. Therefore, for generating the test predictions we used only the BERT-based embeddings, and the code below is written for that embedding only.

TRAINING ON THE ENTIRE CORPUS

In [None]:
processed_translated_train_data.shape

(7593,)

In [None]:
processed_translated_test_data.shape

(844,)

In [None]:
X_train, X_test = feature_combining(processed_translated_train_data, processed_translated_test_data, user_ids_train, user_ids_test, prof_vector_train, prof_vector_test, X_train2, X_test2)

In [None]:
X_train.shape

In [None]:
# Fit the scaler on the training features only and reuse it for the test
# features, so the classifier sees test data on the same scale it was trained on.
final_scaler = StandardScaler().fit(X_train)
X_train = final_scaler.transform(X_train)
X_test = final_scaler.transform(X_test)

WE ALSO GENERATED FINAL OUTPUTS FOR DIFFERENT CLASSIFIERS ON TOP OF THE BERT SENTENCE EMBEDDINGS, BECAUSE MODELS WITH SIMILAR F1 SCORES ON THE EVALUATION SET MAY PERFORM DIFFERENTLY ON THE TEST SET WHEN THE PREDICTIONS ARE UPLOADED TO KAGGLE

In [None]:
predictions1 = svmClassifier(X_train, y_train, X_test)

In [None]:
predictions2 = mlpClassifier(X_train, y_train, X_test)

In [None]:
predictions3 = xgbClassifier(X_train, y_train, X_test)

In [None]:
predictions4 = rfcClassifier(X_train, y_train, X_test)

In [None]:
# `predictions` is never defined in this notebook; inspect the array that is
# actually written to the submission file below.
predictions4.shape

In [None]:
predictions2

In [None]:
# Submission frame: predicted label plus a 0-based running id per test tweet.
testOutputs = pd.DataFrame({
    'label': predictions4.tolist(),
    'id': list(range(len(predictions4))),
})

In [None]:
testOutputs.to_csv("data/testOutputs.csv")