In [1]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer



The tweets are upload as CSV file into the jupyter notebook using read_csv function

In [3]:
train_data = pd.read_csv("MalayTweets.csv")
train_data

Unnamed: 0,Tweets,Polarity
0,#1255(HASH) 2/10/2015(C) 21:00:57(C) unikl(N) ...,positive
1,#TipsMasukU(HASH) unifi(KNK) broadband(KN) bro...,positive
2,#TipsMasukUiTM(HASH) pergi(KK) library(N) uitm...,positive
3,#TM(HASH) mengumumkan(KK) gangguan(KA) #intern...,negative
4,#TMConnect#Streamyx(HASH) sudah(KT) agak(KT) b...,positive
5,#Unifi slow gila babeng.,negative
6,#unifi(HASH) #admin(HASH) #sobsob(HASH) unifi(...,positive
7,#unifi(HASH) vip(N) 20(D) lagging(VB) gila(KA)...,positive
8,@__diba(MEN) then(ADV) kamu(N) belanja(N) pasa...,positive
9,@__ji95(MEN) hi(N) @=*=@ you(ADV) sudah(KT) cu...,negative


Info() function is used to get the summarization of the data uploaded.

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
Tweets      2000 non-null object
Polarity    2000 non-null object
dtypes: object(2)
memory usage: 31.3+ KB


head() function is used to show the rows in the data. Top 5 rows of the data are returned by default. The parameter is set to 10, to return 10 rows.

In [5]:
train_data.head(10)['Tweets']

0    #1255(HASH) 2/10/2015(C) 21:00:57(C) unikl(N) ...
1    #TipsMasukU(HASH) unifi(KNK) broadband(KN) bro...
2    #TipsMasukUiTM(HASH) pergi(KK) library(N) uitm...
3    #TM(HASH) mengumumkan(KK) gangguan(KA) #intern...
4    #TMConnect#Streamyx(HASH) sudah(KT) agak(KT) b...
5                             #Unifi slow gila babeng.
6    #unifi(HASH) #admin(HASH) #sobsob(HASH) unifi(...
7    #unifi(HASH) vip(N) 20(D) lagging(VB) gila(KA)...
8    @__diba(MEN) then(ADV) kamu(N) belanja(N) pasa...
9    @__ji95(MEN) hi(N) @=*=@ you(ADV) sudah(KT) cu...
Name: Tweets, dtype: object

The tweets are preprocessed using RegEx

In [6]:
import re
def process_tweet(Tweets):
    return " ".join(re.sub("\([A-Za-z]+\)|\@\=\*\=\@", " ",Tweets.lower()).split())

In [7]:
train_data['Tweets'] = train_data['Tweets'].apply(process_tweet)

In [8]:
train_data.head()

Unnamed: 0,Tweets,Polarity
0,#1255 2/10/2015 21:00:57 unikl bila hendak upg...,positive
1,#tipsmasuku unifi broadband broadband semua ma...,positive
2,#tipsmasukuitm pergi library uitm nescaya ada ...,positive
3,#tm mengumumkan gangguan #internet perlahan se...,negative
4,#tmconnect#streamyx sudah agak besok cuti panj...,positive


Train_test_split() function is used to split the Tweets into train and test set.

In [9]:
x_train, x_test, y_train, y_test = train_test_split(train_data["Tweets"], train_data["Polarity"], test_size = 0.2, random_state = 3)

Load Malay stop words to eliminate stop words in the Tweets

In [10]:
def get_stop_words(stop_file_path):
    """load stop words"""
    with open(stop_file_path, 'r', encoding = 'utf-8') as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

In [11]:
stopwords = get_stop_words("Malay_stopwords.txt")

In [12]:
print(stopwords)

frozenset({'media', 'global', 'negara-negara', 'proses', 'pekerja', 'kaunter', 'islam', 'nov', 'sistem', 'awam', 'unit', 'australia', 'ini', 'mempunyai', 'diniagakan', 'perdagangan', 'jun', 'penerbangan', 'menjadi', 'peningkatan', 'mohd', 'pinjaman', 'maklumat', 'tengah', 'perusahaan', 'bhd', 'barangan', 'terdapat', 'kumpulan', 'perkara', 'dari', 'menggalakkan', 'pada', 'derivatives', 'ditutup', 'utama', 'lumpur', 'katanya', 'ketua', 'syarikat', 'merosot', 'kemudahan', 'untuk', 'paras', 'akhbar', 'sekuriti', 'sebelumnya', 'apabila', 'amerika', 'april', 'terbuka', 'langkah', 'mana', 'bahan', 'kenyataan', 'negeri', 'tempoh', 'okt', 'tiga', 'wang', 'kata', 'jumaat', 'sudah', 'melihat', 'kesihatan', 'eksport', 'bulan', 'dr', 'lebih', 'sementara', 'tan', 'berubah', 'tanah', 'timur', 'sebarang', 'parlimen', 'baik', 'jumlah', 'kuasa', 'najib', 'dana', 'mengenai', 'pembangunan', 'baru-baru', 'ramai', 'pilihan', 'mendapatkan', 'modal', 'kini', 'prestasi', 'amat', 'sokongan', 'walaupun', 'menari

CountVectorizer is used to convert the collection of tweets to a matrix of token counts.This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix. TfidfTransformer() is used transform the count matrix to a normalized tf-idf representation.

fit_transform() is used to fit to data, then transform it.

In [13]:
count_vect = CountVectorizer(stop_words=stopwords)
transformer = TfidfTransformer(norm=None,sublinear_tf=True)

x_train_counts = count_vect.fit_transform(x_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)

In [14]:
x_test_counts = count_vect.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)

n_estimators = The number of trees in the forest.

In [15]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200)
model.fit(x_train_tfidf,y_train)

  from numpy.core.umath_tests import inner1d


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
predictions = model.predict(x_test_tfidf)

print(predictions)

['negative' 'negative' 'negative' 'negative' 'positive' 'positive'
 'positive' 'negative' 'negative' 'negative' 'negative' 'positive'
 'negative' 'positive' 'negative' 'positive' 'positive' 'negative'
 'negative' 'positive' 'positive' 'positive' 'negative' 'positive'
 'negative' 'negative' 'negative' 'negative' 'positive' 'positive'
 'negative' 'negative' 'negative' 'negative' 'negative' 'positive'
 'positive' 'negative' 'negative' 'positive' 'positive' 'positive'
 'positive' 'positive' 'negative' 'positive' 'negative' 'positive'
 'positive' 'negative' 'negative' 'negative' 'positive' 'positive'
 'positive' 'negative' 'negative' 'positive' 'positive' 'positive'
 'positive' 'negative' 'positive' 'negative' 'negative' 'positive'
 'positive' 'positive' 'negative' 'negative' 'negative' 'positive'
 'negative' 'negative' 'positive' 'positive' 'negative' 'positive'
 'negative' 'negative' 'positive' 'positive' 'positive' 'negative'
 'positive' 'negative' 'negative' 'positive' 'negative' 'posit

Accuracy classification score. In multilabel classification, this function computes subset accuracy: the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true.

In [17]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,predictions))

0.745
