In [1]:
import pandas as pd
import numpy as np
import json
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
import sys

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import accuracy_score,matthews_corrcoef

from tqdm import tqdm, trange,tnrange,tqdm_notebook
import random
import os
import io

train_identification = pd.read_csv('dm2020-hw2-nthu/data_identification.csv')
train_emotion = pd.read_csv('dm2020-hw2-nthu/emotion.csv')
sample_submit = pd.read_csv('dm2020-hw2-nthu/sampleSubmission.csv')
json_feature = pd.read_json('dm2020-hw2-nthu/tweets_DM.json', lines=True)
data = pd.merge(train_identification, train_emotion, on=["tweet_id"])

In [2]:
identification_train = data['identification'] == 'train'
identification_test = train_identification['identification'] == 'test'

train_X = data.loc[identification_train]
test_X = train_identification.loc[identification_test]

test_X['emotion'] = np.nan
df = pd.concat([train_X, test_X])

json_feature['hashtags'] = np.nan; json_feature['text'] = np.nan
json_extend = pd.io.json.json_normalize(json_feature._source)
json_extend['tweet.text'].isna().sum()
json_feature['hashtags'] = json_extend['tweet.hashtags']
json_feature['text'] = json_extend['tweet.text']
json_feature['tweet_id'] = json_extend['tweet.tweet_id']

df = pd.merge(df, json_feature, on=["tweet_id"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
  # This is added back by InteractiveShellApp.init_path()


In [3]:
df_train = df.loc[df['identification'] == 'train']
df_test = df.loc[df['identification'] == 'test']

df_train['emotion'].unique()

array(['joy', 'trust', 'anticipation', 'sadness', 'disgust', 'fear',
       'surprise', 'anger'], dtype=object)

In [4]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df_train['emotion_enc'] = labelencoder.fit_transform(df_train['emotion'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = TfidfVectorizer(#min_df=150, 
                     max_features=5000000, 
                     strip_accents='unicode',
                     analyzer='word',
                     token_pattern=r'\w{1,}',
                     stop_words='english', 
                     ngram_range=(1,2),
)

cv2 = TfidfVectorizer(sublinear_tf=True,
                      strip_accents='unicode',
                      analyzer='char',
                      stop_words='english',
                      ngram_range=(2,6),
                      max_features=5000000,
)

word = cv.fit_transform(df_train['text'])
char = cv2.fit_transform(df_train['text'])

In [48]:
from scipy.sparse import hstack
text= hstack([word, char])


In [41]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(text,
                                                    df_train['emotion'], 
                                                    test_size=0.30, 
                                                    random_state=5)

In [42]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifierCV

mnb = LogisticRegression(solver='sag')
mnb_tuning = MultinomialNB()


In [43]:
mnb_tuning.fit(X_train, y_train)
predicted = mnb_tuning.predict(X_test)
pred = mnb_tuning.predict(X_train)

In [44]:
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix,classification_report

acc_test_score = metrics.accuracy_score(predicted,y_test)
acc_train_score = metrics.accuracy_score(pred,y_train)

prec_score = precision_score(y_test,predicted, average='macro')
recall = recall_score(y_test, predicted,average='macro')
f1 = f1_score(y_test,predicted,average='macro')
matrix = confusion_matrix(y_test,predicted)

print(str('acc_train_score: '+'{:04.2f}'.format(acc_train_score*100))+'%')
print(str('acc_train_score: '+'{:04.2f}'.format(acc_test_score*100))+'%')

print(str('Precision: '+'{:04.2f}'.format(prec_score*100))+'%')
print(str('Recall: '+'{:04.2f}'.format(recall*100))+'%')
print('F1 Score: ',f1)
print(matrix)
'''
acc_train_score: 57.52%
acc_train_score: 57.53%
Precision: 60.35%
Recall: 43.45%
update max
acc_train_score: 59.12%
acc_train_score: 57.86%
Precision: 58.85%
Recall: 44.77%

----iterator 10000----
acc_train_score: 56.07%
acc_train_score: 55.09%
Precision: 53.01%
Recall: 40.25%

logist -> 25000
slove=sag
acc_train_score: 62.72%
acc_train_score: 59.40%

-> 30000
acc_train_score: 63.07%
acc_train_score: 59.54%

- 100000
bay
acc_train_score: 56.45%
acc_train_score: 53.93%
- 1000000
acc_train_score: 58.66%
acc_train_score: 54.61%
'''


acc_train_score: 58.66%
acc_train_score: 54.61%
Precision: 74.01%
Recall: 34.77%
F1 Score:  0.38838487803609223
[[  1058    493   1040     10   6574   2594      2    130]
 [     1  38700    733     27  31050   2938      8   1286]
 [     3   1099  11659     38  18065  10279      6    416]
 [     0   1214    547   3563  11676   1928      0    225]
 [     2   6991   1146     64 140032   3639     12   2946]
 [     1   1571   2933     50  25114  28108      9    504]
 [     0    540    816     14   9081   2730   1401    189]
 [     0   5157    740     12  39452   2091      6  13956]]


'\nacc_train_score: 57.52%\nacc_train_score: 57.53%\nPrecision: 60.35%\nRecall: 43.45%\nupdate max\nacc_train_score: 59.12%\nacc_train_score: 57.86%\nPrecision: 58.85%\nRecall: 44.77%\n\n----iterator 10000----\nacc_train_score: 56.07%\nacc_train_score: 55.09%\nPrecision: 53.01%\nRecall: 40.25%\n\nlogist -> 25000\nslove=sag\nacc_train_score: 62.72%\nacc_train_score: 59.40%\n\n-> 30000\nacc_train_score: 63.07%\nacc_train_score: 59.54%\n\n- 100000\nbay\nacc_train_score: 56.45%\nacc_train_score: 53.93%\n'

In [49]:
mnb.fit(text, df_train['emotion'])

pred_word = cv.transform(df_test['text'])
pred_char = cv2.transform(df_test['text'])
pred_text = hstack([pred_word, pred_char])

submit_pred = mnb.predict(pred_text)
#result = labelencoder.inverse_transform(submit_pred)

df_test['id'] = df_test['tweet_id']
df_test['emotion'] = submit_pred

df_test[['id', 'emotion']].to_csv('submission/TFIDF_logistic.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [50]:
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

api.competition_submit('submission/TFIDF_logistic.csv', 'API Submission', 'dm2020-hw2-nthu')



100%|██████████| 6.13M/6.13M [00:05<00:00, 1.11MB/s]


Successfully submitted to DM20 HW2 Kaggle Competition