In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re 
import nltk
from nltk.corpus import stopwords,wordnet
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict,GridSearchCV,StratifiedKFold
import xgboost as xgb
from sklearn.preprocessing import FunctionTransformer
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,plot_confusion_matrix,accuracy_score,classification_report
#py.offline.init_notebook_mode(connected=True)
import warnings
warnings.filterwarnings("ignore")



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PSEN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PSEN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df=pd.read_csv('train.csv')
df=df.rename(columns={'id':'id','keyword':'keyword','location':'location','text':'text','target':'target'})

df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:

stop_words = stopwords.words('english')
lemmatizer=WordNetLemmatizer()



def process(s):
    #s = re.sub(r'[0-9]+', '', s)
    s = re.sub(r'@\w+', '', s)
    s = re.sub(r'http\w+', '', s)
    s = re.sub(r'www.[^ ]+', '', s)
    #s = re.sub(r'[\W\_]', ' ', s)
    s = re.sub(r'''[Â¬!"#$%&()*+,-./:;<=>?@[\]'^'_`\{|}~]''', '', s)
    return s.lower()

def stop_word(words):
    token=re.split('\W+',words)
    txt=[word for word in token if word not in stop_words]
    return txt

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


def lemmatization(words):
    txt=[lemmatizer.lemmatize(word,get_wordnet_pos(word)) for word in words]
    return txt

def final_text(words):
    return ' '.join(words)


def model_union(transfomer_numeric,transformer_text,X,y):
    pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', transfomer_numeric)
            ])),
            ('text_features', Pipeline([
                ('selector', transformer_text),
                ('vec', TfidfVectorizer(analyzer='word'))
            ]))
        ])),
    ('clf', RandomForestClassifier())])


    # Grid Search Parameters for RandomForest
    # param_grid = {'clf__n_estimators': np.linspace(1, 100, 10, dtype=int),
    #             'clf__min_samples_split': [3, 10],
    #             'clf__min_samples_leaf': [3],
    #             'clf__max_features': [7],
    #             'clf__max_depth': [None],
    #             'clf__criterion': ['gini'],
    #             'clf__bootstrap': [False]}

    kfold = StratifiedKFold(n_splits=3)
    scoring = {'Accuracy': 'accuracy', 'F1': 'f1_macro'}
    refit = 'F1'


    # rf_model = GridSearchCV(pipeline, param_grid=param_grid, cv=kfold, scoring=scoring,refit=refit, n_jobs=-1, return_train_score=True, verbose=1)
    # rf_model.fit(X, y)
    # rf_best = rf_model.best_estimator_

    score=cross_val_score(pipeline,X=X,y=y,cv=kfold,scoring='accuracy')
    return score

In [5]:

df['modified_text']=df['text'].apply(lambda x: process(x)).apply(lambda x: stop_word(x)).apply(lambda x: lemmatization(x)).apply(lambda x: final_text(x))
df=df[['id','keyword','location','text','modified_text','target']]
df['word_count'] = df['modified_text'].apply(lambda x: len(x.split()))
df['unique_word_count'] = df['modified_text'].apply(lambda x: len(set(x.split())))
df['mean_word_len'] = df['modified_text'].apply(lambda x: np.mean([len(w) for w in x.split()]))
df['char_count']=df['modified_text'].apply(lambda x: len(x))
df1=df[['target','modified_text']]
df2=df.drop(['id','keyword','location','text'],axis=1)[['target','modified_text','word_count','unique_word_count','mean_word_len','char_count']]


In [6]:
df.head()

Unnamed: 0,id,keyword,location,text,modified_text,target,word_count,unique_word_count,mean_word_len,char_count
0,1,,,Our Deeds are the Reason of this #earthquake M...,deed reason earthquake may allah forgive u,1,7,7,5.142857,42
1,4,,,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada,1,7,7,4.428571,37
2,5,,,All residents asked to 'shelter in place' are ...,resident ask shelter place notify officer evac...,1,11,9,6.272727,79
3,6,,,"13,000 people receive #wildfires evacuation or...",13000 people receive wildfire evacuation order...,1,7,7,7.285714,58
4,7,,,Just got sent this photo from Ruby #Alaska as ...,get sent photo ruby alaska smoke wildfire pour...,1,9,9,5.111111,55


In [7]:
# model=Model(df1)

X=df2[['word_count','unique_word_count','mean_word_len','char_count','modified_text']]
y=df2['target']


transfomer_numeric = FunctionTransformer(lambda x: x[['word_count','unique_word_count','mean_word_len','char_count']],validate=False)
transformer_text = FunctionTransformer(lambda x: x['modified_text'],validate=False)

model_union(transfomer_numeric=transfomer_numeric,transformer_text=transformer_text,X=X,y=y)



array([0.7249803 , 0.67100079, 0.70752858])