In [260]:
pip install lightgbm




In [261]:
# important libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from math import sqrt
import string
import re

In [262]:
# Importing dataset

train = pd.read_csv('https://raw.githubusercontent.com/SiboneloJunior/To-Vaccinate-or-Not-to-Vaccinate-It-s-not-a-Question/master/Dataset/Train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/SiboneloJunior/To-Vaccinate-or-Not-to-Vaccinate-It-s-not-a-Question/master/Dataset/Test.csv')
sample = pd.read_csv('https://raw.githubusercontent.com/SiboneloJunior/To-Vaccinate-or-Not-to-Vaccinate-It-s-not-a-Question/master/Dataset/SampleSubmission.csv')

#### Dataframe Lookup

In [263]:
# Shape

print(train.shape)
print(test.shape)
print(sample.shape)

(10001, 4)
(5177, 2)
(5177, 2)


In [264]:
train.head()

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [265]:
test.head()

Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe..."
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though..."


In [266]:
sample.head()

Unnamed: 0,tweet_id,label
0,00BHHHP1,0
1,00UNMD0E,0
2,01AXPTJF,0
3,01HOEQJW,0
4,01JUKMAO,0


#### Preprocessing

In [267]:
# Remove nulls

train = train.dropna()
train.isnull().sum()

tweet_id     0
safe_text    0
label        0
agreement    0
dtype: int64

In [268]:
# Fill the null on the test dataset witha random string

test = test.fillna('CDC eyeing bird flu vaccine for humans')

In [269]:
# Drop irrelevant columns

train = train.drop(['tweet_id'], axis=1)

In [270]:
# Get all rows with reasonable agreement

train = train[train['agreement'] > 0.5]

In [271]:
# Convert labels into integers

label_int = train['label'].astype(int)
train['label'] = label_int

#### General text cleaning

In [272]:
# Drop duplicated tweets

train = train.drop_duplicates(subset='safe_text', keep="first")

In [273]:
# Define the Lemmetizer class

lemma = WordNetLemmatizer()

In [274]:
#  Define Punctuations + User string

punctuation = list(string.punctuation)

sw_pun = punctuation + ['user']

In [275]:
# Cleaning function_1

def preprocess(tweet):
    tweet = re.sub(r"https?:\/\/t.co\/[A-Za-z0-9]+",
                   "", tweet)                       # removing urls
    tweet = re.sub('[^\w]',' ',tweet)               # remove embedded special characters        
    tweet = re.sub('[\d]','',tweet)                 # this will remove numeric characters
    tweet = tweet.lower()
    words = tweet.split()  
    sentence = ""
    for word in words:     
        if word not in (sw_pun):                    # removing punctuations + 'User'                
            word = lemma.lemmatize(word,pos = 'v')  # converting to lemma    
            if len(word) > 3:                       # we will consider words with length > 3
                sentence = sentence + word + ' '             
    return(sentence)

In [276]:
# Apply cleaning function_1 to train and test datasets

train['safe_text'] = train['safe_text'].apply(lambda s : preprocess(s))
test['safe_text'] = test['safe_text'].apply(lambda s : preprocess(s))

In [277]:
# Cleaning function_2 - emojis

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [278]:
# Apply cleaning function_1 to train and test datasets

train['safe_text'] = train['safe_text'].apply(lambda s : remove_emoji(s))
test ['safe_text'] = test ['safe_text'].apply(lambda s : remove_emoji(s))

#### Feature engineering

In [279]:
# Feature engineering function

def add_features(df) :
    
    punctuation=string.punctuation
    stop = stopwords.words('english')
    
    df['safe_text']=df['safe_text'].astype('category')
    df['word_count']=df['safe_text'].apply(lambda x: len(str(x).split(" ")))
    df['char_count'] = df['safe_text'].str.len()

    df['stopwords'] = df['safe_text'].apply(lambda x: len([x for x in x.split() if x in stop]))
    df['word_density'] = df['char_count'] / (df['word_count']+1)
    
    return df

In [280]:
# Apply function to both train and test

train = add_features(train)
test  = add_features(test)

In [281]:
# Define columns

col=['word_count', 'char_count',
 'stopwords', 'word_density']

#### Model training + testing

In [282]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

v_name = TfidfVectorizer(ngram_range=(1,1),stop_words="english", analyzer='word')
name_tr =v_name.fit_transform(train['safe_text'])
name_ts =v_name.transform(test['safe_text'])

In [283]:
from scipy.sparse import csr_matrix
from scipy import sparse

features_train = sparse.hstack((train[col],name_tr )).tocsr()
features_test = sparse.hstack((test[col],name_ts )).tocsr()

In [284]:
X= features_train
y= train['label']

X_train,X_val,y_train,y_val = train_test_split(X, y, stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.25, shuffle=True)

In [285]:
from lightgbm import LGBMRegressor

lg = LGBMRegressor(random_state=42,learning_rate=0.05)

lg.fit(X_train,y_train)

y_pred = lg.predict(X_val)

print(np.sqrt(mean_squared_error(y_train.values,lg.predict(X_train))))
print(np.sqrt(mean_squared_error(y_val.values,y_pred)))

0.4877422070897596
0.5543156389774229


#### Submission

In [286]:
X = features_test

y_sub = lg.predict(X)

In [287]:
sample['label'] = y_sub

In [288]:
sample

Unnamed: 0,tweet_id,label
0,00BHHHP1,0.252449
1,00UNMD0E,0.612813
2,01AXPTJF,0.112282
3,01HOEQJW,0.507697
4,01JUKMAO,0.323991
...,...,...
5172,ZXVVNC5O,0.912599
5173,ZYIANVI8,0.077235
5174,ZYITEHAH,0.446669
5175,ZZ3BMBTG,0.781136


In [289]:
sample.to_csv('model_v9.csv' , index = False)

#### Zindi Leaderboard score = 0.5867