In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import VotingClassifier
import os
import xgboost as xgb

In [31]:
import re

In [30]:
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samael/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [33]:
stemmer = SnowballStemmer(language='english')

In [51]:
stop_words = set(stopwords.words("english"))

In [52]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/samael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
directory = "./data"
data = pd.DataFrame(columns=['tweet_id', 'tweet_text', 'class_label'])

In [3]:
for path, subdir, files in os.walk(directory):
    if len(files) == 0:
        continue
    for file in files:
        df = pd.read_csv(os.path.join(path, file), sep='\t')
        data = pd.concat([data, df])
    break

In [47]:
data.head()

Unnamed: 0,tweet_id,tweet_text,class_label
0,798262465234542592,RT @MissEarth: New Zealand need our prayers af...,sympathy_and_support
1,771464543796985856,"@johnaglass65 @gordonluke Ah, woke up to a nig...",caution_and_advice
2,797835622471733248,RT @terremotocentro: #eqnz if you need a tool ...,requests_or_urgent_needs
3,798021801540321280,RT @BarristerNZ: My son (4) has drawn a pictur...,other_relevant_information
4,798727277794033664,Due to earthquake damage our Defence Force is ...,infrastructure_and_utility_damage


In [54]:
def clean_text(each_text):

    # remove URL from text
    each_text_no_url = re.sub(r"http\S+", "", each_text)
    
    # remove numbers from text
    text_no_num = re.sub(r'\d+', '', each_text_no_url)

    # tokenize each text
    word_tokens = word_tokenize(text_no_num)
    
    # remove sptial character
    clean_text = []
    for word in word_tokens:
        clean_text.append("".join([e for e in word if e.isalnum()]))

    # remove stop words and lower
    text_with_no_stop_word = [w.lower() for w in clean_text if not w in stop_words]  

    # do stemming
    stemmed_text = [stemmer.stem(w) for w in text_with_no_stop_word]
    
    return " ".join(" ".join(stemmed_text).split())


data['processed_tweet'] = data['tweet_text'].apply(clean_text)

In [55]:
X = data.processed_tweet
y = data.class_label

In [56]:
y.factorize()[0]

array([0, 1, 2, ..., 4, 0, 7])

In [57]:
y = pd.factorize(y)[0]

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [59]:
vectorizer = CountVectorizer()

In [60]:
X_train = vectorizer.fit_transform(X_train)

In [61]:
X_test = vectorizer.transform(X_test)

In [62]:
model = xgb.XGBClassifier()

In [63]:
model.fit(X_train, y_train)

In [64]:
y_pred = model.predict(X_test)

In [65]:
from sklearn import metrics 

In [70]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.77      0.80       127
           1       0.64      0.71      0.68       150
           2       0.20      0.08      0.12        12
           3       0.38      0.35      0.36        99
           4       0.72      0.84      0.77        98
           5       0.79      0.79      0.79        33
           6       0.69      0.77      0.73        56
           7       0.35      0.26      0.30        65
           8       0.85      0.89      0.87        19

    accuracy                           0.65       659
   macro avg       0.61      0.61      0.60       659
weighted avg       0.63      0.65      0.64       659



In [72]:
from sklearn import ensemble

In [84]:
gbc_model = ensemble.GradientBoostingClassifier(learning_rate=0.1,                                            
                                            n_estimators=2500,
                                            max_depth=12,
                                            min_samples_split=8,
                                            min_samples_leaf=4,
                                            max_features=9,
                                            subsample=0.9)

In [85]:
gbc_model.fit(X_train, y_train)

In [86]:
predicted_prob = model.predict_proba(X_test)[:,1]
predicted = model.predict(X_test)

In [87]:
accuracy = metrics.accuracy_score(predicted, y_test)

In [88]:
accuracy

0.6464339908952959