Create a classifier for predicting sentiment using social media messages in Thai language. 

### Notes:
* This is a classification problem
* Data source: https://github.com/PyThaiNLP/wisesight-sentiment/
* Other models using this dataset: https://github.com/PyThaiNLP/wisesight-sentiment/tree/master/kaggle-competition 
* Steps:    
    1. Download the data
    2. Visualise the data
    3. (If needed) clean, normalise, transform and visualise the data to get a feeling about potential relationships between the variables
    4. Try to find the most relevant features (feature selection)
    5. Try to create new features
    6. Agree on a metric (i.e. Precision and recall)
    7. Develop a classification model (start with simple models and then grow in complexity)    
    8. Analyse the results for train + test data and compare different models

In [1]:
import numpy as np
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from pythainlp import word_tokenize

In [2]:
# f = open("ex3-data/neg.txt")
# neg_comments = f.read().splitlines()
# f.close()
with open('ex3-data/neg.txt', 'r') as f:
    neg_comments = f.read().splitlines()

with open('ex3-data/pos.txt', 'r') as f:
    pos_comments = f.read().splitlines()

with open('ex3-data/neu.txt', 'r') as f:
    neu_comments = f.read().splitlines()

In [3]:
neg_comments_with_labels = list(map(lambda x: (x, 'negative'), neg_comments))
pos_comments_with_labels = list(map(lambda x: (x, 'positive'), pos_comments))
neu_comments_with_labels = list(map(lambda x: (x, 'neutral'), neu_comments))

comments = neg_comments_with_labels + pos_comments_with_labels + neu_comments_with_labels

df = pd.DataFrame(data=comments,columns=['comments', 'label'])
df.head()


Unnamed: 0,comments,label
0,☹️,negative
1,😔,negative
2,😞,negative
3,😥,negative
4,รำ,negative


In [4]:
MIN_COMMENT_LEN = 5

comments = ""
for comment in df['comments'].values:
    if len(word_tokenize(comment)) > MIN_COMMENT_LEN:
        comments += " " + comment

In [6]:
tfidf = TfidfVectorizer(tokenizer=word_tokenize, stop_words='english')
tfidf.fit(comments.split())

TfidfVectorizer(stop_words='english',
                tokenizer=<function word_tokenize at 0x7fda311e81f0>)

In [23]:
X = []
y = []
for ix, comment in enumerate(df['comments'].values):
    tokenized = word_tokenize(comment)
    if len(tokenized) > MIN_COMMENT_LEN:
        X.append(tfidf.transform([comment]))
        y.append(df['label'][ix])


In [35]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [36]:
X_train = list(map(lambda x: x.toarray()[0], X_train))
X_test = list(map(lambda x: x.toarray()[0], X_test))

In [37]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier()

In [38]:
clf.score(X_test, y_test)

0.7087184611541344