# Character Level Processing and predicting target

In [54]:
import nltk
import pandas as pd
import numpy as np
import re
import pickle

from nltk.stem import PorterStemmer

In [21]:
tweets_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")
texts = list(tweets_data["text"])
texts_test = list(test_data["text"])

In [55]:
chars = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q', 'r','s','t','u','v','w','x','y','z','$']
twins = [ char1+char2 for char1 in chars for char2 in chars]

twins_dict = { twins[i]:i for i in range(len(twins)) }
print(len(twins))

729


In [56]:
def convert_to_twins(texts):
    new_texts = []
    for text in texts:
        text=text.lower()
        text_tokens = text.split()
        text_tokens = [re.sub(r'[^a-z]', '', word) for word in text_tokens] # remove special characters
        new_texts.append("$".join(text_tokens))
    twins_tweets = []
    for new_text in new_texts:
        tweet_twins=[]
        for i in range(len(new_text)-1):
            my_twin = new_text[i:i+2]
            tweet_twins.append(twins_dict[my_twin])
        twins_tweets.append(tweet_twins)
    return twins_tweets

train_twins_tweets = convert_to_twins(texts)
test_twins_tweets = convert_to_twins(texts_test)

In [57]:
# Create tf-idf matrix for the training data and test data

twin_tweet_freq = {}
for tweet in train_twins_tweets + test_twins_tweets:
    for twin in set(tweet):
        if twin in twin_tweet_freq:
            twin_tweet_freq[twin]+=1
        else:
            twin_tweet_freq[twin]=1
idf = { twin:np.log(len(train_twins_tweets+test_twins_tweets)/twin_tweet_freq[twin]) for twin in twin_tweet_freq }

def tf_idf(tweet):
    tf = [ 0 for i in range(len(twins))]
    for twin_id in set(tweet):
        # print(twin_id)
        tf[twin_id] = tweet.count(twin_id)/len(tweet)
    return [ tf[twin]*idf[twin] for twin in range(len(twins)) ]

train_tf_idf = [ tf_idf(tweet) for tweet in train_twins_tweets ]
test_tf_idf = [ tf_idf(tweet) for tweet in test_twins_tweets ]



In [58]:
# Apply Random Forest Classifier and XGBoost Classifier after splitting training data into training and validation data
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train = train_tf_idf
y_train = list(tweets_data["target"])

# X_train, X_val, y_train, y_val = train_test_split(train_tf_idf, tweets_data["target"], test_size=0.1, random_state=42)

rfc = RandomForestClassifier(max_depth=2, random_state=0)

rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_val)
print("Random forest accuracy ", accuracy_score(y_val, y_pred))


xgc = XGBClassifier()
xgc.fit(X_train, y_train)
# y_pred = rfc.predict(X_val)
# print("XGBoost accuracy ", accuracy_score(y_val, y_pred))


Random forest accuracy  0.652230971128609


In [87]:
# Implement a neural network of 2 hidden layers with ReLU activation function over the tf-idf matrix to predict the target 
#   using sequential model from keras
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD

# different axtivation layers: https://keras.io/api/layers/activations/
model = Sequential([
    Dense(256, input_shape=(len(twins), )),
    Activation('softmax'),
    Dense(64),
    Activation('relu'),
    Dense(1),
    Activation('sigmoid'),
])

model.compile(optimizer='sgd',
                loss='mean_squared_error',
                metrics=['accuracy'])

model.fit(np.array(X_train), np.array(y_train), epochs=10, batch_size=16)
pickle.dump(model, open("neural_model.pkl", "wb"))

# y_pred = model.predict(np.array(X_val))
# print("Neural network accuracy ", model.score(np.array(X_val), np.array(y_val)))

# Create a submission file for the test data
# y_pred = model.predict_classes(np.array(test_tf_idf))
# y_pred = model.preict(np.array(test_tf_idf))
# submission = pd.DataFrame()
# submission["id"] = test_data["id"]
# submission["target"] = y_pred
# submission.to_csv("submission.csv", index=False)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [82]:
y_pred = model.predict(np.array(test_tf_idf))
submission = pd.DataFrame()
submission["id"] = test_data["id"]
target = np.array(y_pred) #[1 if (y_pr >= 0.5) else 0 for y_pr in y_pred]
min_t = min(target)
max_t = max(target)
print(sum(y_train)/len(y_train))

target = [ (val-min_t)/(max_t-min_t) for val in target]

submission["target"] = [1 if val>0.605 else 0 for val in target]
print(sum(submission["target"])/len(submission["target"]))
# print(submission["target"])
# max(submission["target"])
# print(min(submission["target"]))
submission.to_csv("data/neural_submission.csv", index=False)


0.4296597924602653
0.4241495556236592
