In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re 
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict,GridSearchCV,StratifiedKFold
import xgboost as xgb
from sklearn.preprocessing import FunctionTransformer
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,plot_confusion_matrix,accuracy_score,classification_report
#py.offline.init_notebook_mode(connected=True)
import warnings
warnings.filterwarnings("ignore")



[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/priyabratasen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/priyabratasen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df=pd.read_csv('train.csv')
df=df.rename(columns={'id':'id','keyword':'keyword','location':'location','text':'text','target':'target'})

df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:

stop_words = stopwords.words('english')
lemmatizer=WordNetLemmatizer()



def process(s):
    #s = re.sub(r'[0-9]+', '', s)
    s = re.sub(r'@\w+', '', s)
    s = re.sub(r'http\w+', '', s)
    s = re.sub(r'www.[^ ]+', '', s)
    #s = re.sub(r'[\W\_]', ' ', s)
    s = re.sub(r'''[¬!"#$%&()*+,-./:;<=>?@[\]'^'_`\{|}~]''', '', s)
    return s

def stop_word(words):
    token=re.split('\W+',words)
    txt=[word for word in token if word not in stop_words]
    return txt

def lemmatization(words):
    txt=[lemmatizer.lemmatize(word) for word in words]
    return txt

def final_text(words):
    return ' '.join(words)

In [7]:
class Model:
    
    def __init__(self,df):
        self.df=df
    
    # creating X and y variable and building the trian test split
    def train_test(self):
        X=self.df.iloc[:,1]
        y=self.df.iloc[:,0]
        X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.2,random_state=42)
        return [X_train,X_test,y_train,y_test,X,y]


    # stratified K-Fold cross validation
    def cross_val(self):
        val=StratifiedKFold(n_splits=3)
        return val
    
    
    # creating a pipeline to create verctorization, classification model and Stratified k-fold cross validation
    def score(self):
        
        models={'SVC':svm.SVC(),'Random_Forest':RandomForestClassifier(),'XGboost':xgb.XGBClassifier(verbosity=0),'Naive':MultinomialNB(),'Decision_Tree':DecisionTreeClassifier()}
        z={}


        for key,item in models.items():
            text_clf=Pipeline([('tfidf',TfidfVectorizer()),('clf',item)])
            score=cross_val_score(text_clf,self.train_test()[0],self.train_test()[2],cv=self.cross_val(),scoring='accuracy')
            z.update({key:list(score)})
        df_score=pd.DataFrame(z)
        return df_score

    #Evaluating the model on test set
    def evaluation(self):
        models={'SVC':svm.SVC(),'Random_Forest':RandomForestClassifier(),'XGboost':xgb.XGBClassifier(verbosity=0),'Naive':MultinomialNB(),'Decision_Tree':DecisionTreeClassifier()}
        x=[]
        y=[]

        for key,item in models.items():
    
            text_clf=Pipeline([('tfidf',TfidfVectorizer()),('clf',item)])
            text_clf.fit(self.train_test()[0],self.train_test()[2])
            y_pred=text_clf.predict(self.train_test()[1])
            score=accuracy_score(y_true=self.train_test()[3],y_pred=y_pred)
            x.append(score)
            y.append(key)
        df_score=pd.DataFrame({'model':y,'Accuarcy Score':x})
        return df_score


    #Hyperparameter optimization
    def hyper_param(self):

        model_param={
            'clf__kernel':('linear', 'rbf'), 
            'clf__C':[1,5,10,20], 
            'clf__gamma':['scale','auto'],
            'clf__decision_function_shape':['ovo','ovr']}
                    
        text_clf=Pipeline([('tfidf',TfidfVectorizer()),('clf',svm.SVC())])
        grid=GridSearchCV(text_clf,model_param,cv=self.cross_val(),return_train_score=False)
        grid.fit(self.train_test()[0],self.train_test()[2])
        y_pred=grid.predict(self.train_test()[1])
        accuracy=accuracy_score(self.train_test()[3],y_pred)

        return 'The best score for grid search is {},Accuracy on test set is {}'.format(grid.best_score_,accuracy)          
    

In [8]:

df['modified_text']=df['text'].apply(lambda x: process(x)).apply(lambda x: stop_word(x)).apply(lambda x: lemmatization(x)).apply(lambda x: final_text(x)).str.lower()
df=df[['id','keyword','location','text','modified_text','target']]
df['word_count'] = df['modified_text'].apply(lambda x: len(x.split()))
df['unique_word_count'] = df['modified_text'].apply(lambda x: len(set(x.split())))
df['mean_word_len'] = df['modified_text'].apply(lambda x: np.mean([len(w) for w in x.split()]))
df['char_count']=df['modified_text'].apply(lambda x: len(x))
df1=df[['target','modified_text']]
df2=df.drop(['id','keyword','location','text'],axis=1)[['target','modified_text','word_count','unique_word_count','mean_word_len','char_count']]


In [10]:
def get_numeric(x):
    numerical=list(x.select_dtypes('number').columns)
    return x.loc[:,numerical]

def get_categorical(x):
    numerical=list(x.select_dtypes('object').columns)
    return x.loc[:,numerical]

transfomer_numeric = FunctionTransformer(get_numeric)
transformer_text = FunctionTransformer(get_categorical)


X,y=df2.iloc[:,1:],df2.iloc[:,0]
# categorical=list(X.select_dtypes('object').columns)
# numerical=list(X.select_dtypes('number').columns)


pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', transfomer_numeric)
            ])),
            ('text_features', Pipeline([
                ('selector', transformer_text),
                ('vec', TfidfVectorizer(analyzer='word'))
            ]))
        ])),
    ('clf', RandomForestClassifier())
])

In [30]:
# model=Model(df1)

def get_numeric_data(x):
    return [record[1:].astype(float) for record in x]

def get_categorical_data(x):
    return [record[:1].astype(object) for record in x]

get_categorical_data(X.to_numpy())




[array(['our deeds reason earthquake may allah forgive u'], dtype=object),
 array(['forest fire near la ronge sask canada'], dtype=object),
 array(['all resident asked shelter place notified officer no evacuation shelter place order expected'],
       dtype=object),
 array(['13000 people receive wildfire evacuation order california '],
       dtype=object),
 array(['just got sent photo ruby alaska smoke wildfire pours school '],
       dtype=object),
 array(['rockyfire update california hwy 20 closed direction due lake county fire cafire wildfire'],
       dtype=object),
 array(['flood disaster heavy rain cause flash flooding street manitou colorado springs area'],
       dtype=object),
 array(['im top hill i see fire wood'], dtype=object),
 array(['theres emergency evacuation happening building across street'],
       dtype=object),
 array(['im afraid tornado coming area'], dtype=object),
 array(['three people died heat wave far'], dtype=object),
 array(['haha south tampa getting floo

In [49]:
X.loc[:,numerical]
cat_pipeline=Pipeline([('verctorizer',TfidfVectorizer())])

Unnamed: 0,word_count,unique_word_count,mean_word_len,char_count
0,8,8,5.000000,47
1,7,7,4.428571,37
2,13,11,6.153846,92
3,7,7,7.285714,58
4,10,10,5.000000,60
...,...,...,...,...
7608,9,9,6.777778,69
7609,11,11,5.636364,73
7610,7,7,6.428571,51
7611,14,13,7.000000,111
