In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import VotingClassifier
import os
import xgboost as xgb
import numpy as np

In [5]:
import re

In [6]:
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
stemmer = SnowballStemmer(language='english')

In [8]:
stop_words = set(stopwords.words("english"))

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/samael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
directory = "./data"
data = pd.DataFrame(columns=['tweet_id', 'tweet_text', 'class_label'])

In [11]:
for path, subdir, files in os.walk(directory):
    if len(files) == 0:
        continue
    for file in files:
        df = pd.read_csv(os.path.join(path, file), sep='\t')
        data = pd.concat([data, df])
    break

In [12]:
data.head()

Unnamed: 0,tweet_id,tweet_text,class_label
0,798262465234542592,RT @MissEarth: New Zealand need our prayers af...,sympathy_and_support
1,771464543796985856,"@johnaglass65 @gordonluke Ah, woke up to a nig...",caution_and_advice
2,797835622471733248,RT @terremotocentro: #eqnz if you need a tool ...,requests_or_urgent_needs
3,798021801540321280,RT @BarristerNZ: My son (4) has drawn a pictur...,other_relevant_information
4,798727277794033664,Due to earthquake damage our Defence Force is ...,infrastructure_and_utility_damage


In [13]:
def clean_text(each_text):

    # remove URL from text
    each_text_no_url = re.sub(r"http\S+", "", each_text)
    
    # remove numbers from text
    text_no_num = re.sub(r'\d+', '', each_text_no_url)

    # tokenize each text
    word_tokens = word_tokenize(text_no_num)
    
    # remove sptial character
    clean_text = []
    for word in word_tokens:
        clean_text.append("".join([e for e in word if e.isalnum()]))

    # remove stop words and lower
    text_with_no_stop_word = [w.lower() for w in clean_text if not w in stop_words]  

    # do stemming
    stemmed_text = [stemmer.stem(w) for w in text_with_no_stop_word]
    
    return " ".join(" ".join(stemmed_text).split())


data['processed_tweet'] = data['tweet_text'].apply(clean_text)

In [14]:
X = data.processed_tweet
y = data.class_label

In [15]:
# y.factorize()[0]

In [16]:
y = pd.factorize(y)[0]

In [17]:
y

array([0, 1, 2, ..., 4, 0, 7])

In [18]:
values, counts = np.unique(y, return_counts=True)

In [19]:
values

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [20]:
max_element = counts.argmax()

In [21]:
make2class = np.vectorize(lambda x: 1 if x==max_element else 0)

In [22]:
y = make2class(y)

In [23]:
un, counts2 =np.unique(y, return_counts=True)

In [24]:
counts2

array([1702,  493])

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [26]:
vectorizer = CountVectorizer()

In [27]:
X_train = vectorizer.fit_transform(X_train)

In [28]:
X_test = vectorizer.transform(X_test)

In [29]:
model = xgb.XGBClassifier()

In [30]:
model.fit(X_train, y_train)

In [31]:
y_pred = model.predict(X_test)

In [32]:
from sklearn import metrics 

In [33]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93       532
           1       0.73      0.69      0.71       127

    accuracy                           0.89       659
   macro avg       0.83      0.82      0.82       659
weighted avg       0.89      0.89      0.89       659



In [34]:
from sklearn import ensemble

In [35]:
gbc_model = ensemble.GradientBoostingClassifier(learning_rate=0.01,                                            
                                            n_estimators=2500,
                                            max_depth=12,
                                            min_samples_split=8,
                                            min_samples_leaf=4,
                                            max_features=2,
                                            subsample=0.9)

In [36]:
gbc_model.fit(X_train, y_train)

In [37]:
predicted_prob = model.predict_proba(X_test)[:,1]
predicted = model.predict(X_test)

In [38]:
accuracy = metrics.accuracy_score(predicted, y_test)

In [39]:
accuracy

0.8907435508345979

In [40]:
df = pd.DataFrame()

In [41]:
df['class'] = y_test
df['predicted_class'] = predicted
df['predicted_xgboost'] = y_pred

In [42]:
df.head()

Unnamed: 0,class,predicted_class,predicted_xgboost
0,0,0,0
1,0,0,0
2,1,0,0
3,0,0,0
4,1,1,1


In [43]:
df.to_csv("boosting.csv")