In [1]:
# Empezar modelo NLP - Machine Learning

#### Libraries

In [263]:
import pandas as pd
import numpy as np
import re
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from nltk.classify import ClassifierI
from statistics import mode

#### Datasets

In [104]:
phrases1 = pd.read_csv("../Dataset/Suicidal phrases/dataset2.csv")
phrases = pd.read_csv("../Dataset/Suicidal phrases/dataset1.csv") #no hay NaN. Target 0: depresión. 1: bien. 11434 rows
phrases3 = pd.read_csv("../Dataset/Suicidal phrases/dataset3.csv")

In [181]:
phrases

Unnamed: 0,Id,Text,Target
0,289612631038361600,Argh.. I hate my life,0
1,289612727654170624,I'm good,1
2,289612736063758337,Enjoy my life,1
3,289612773716008960,RT : I do what I want,1
4,289612819807211520,My life is just a series of unfortunate fucked...,0
...,...,...,...
11429,285626150695337984,ajhg;dfakjh;ajh;a i'm so disappointed in mysel...,0
11430,285626247164342272,RT : I suck at life,0
11431,285626272330178560,My Life Is The Shit I Guess That This Was Mean...,0
11432,285626292177612801,I hate myself,0


In [109]:
# Let's create the DataFrame

phrases_t = phrases['Text']
phrases_t

0                                    Argh.. I hate my life
1                                                 I'm good
2                                            Enjoy my life
3                                    RT : I do what I want
4        My life is just a series of unfortunate fucked...
                               ...                        
11429    ajhg;dfakjh;ajh;a i'm so disappointed in mysel...
11430                                  RT : I suck at life
11431    My Life Is The Shit I Guess That This Was Mean...
11432                                        I hate myself
11433                                    I feel depressed,
Name: Text, Length: 11434, dtype: object

#### Cleaning Dataset and rows

In [110]:
def remove_by_regex(tweet, regexp):
        return re.sub(regexp, '', tweet)

def remove_special_char(tweet):
    return re.sub(r"[^a-zA-Z0-9 ]", " ", tweet) #add space placeholder

def remove_numbers(tweet):
    return remove_by_regex(tweet, re.compile(r"[1234567890]"))

def clean_up(tweet):
    tweet = remove_numbers(tweet)
    tweet = remove_special_char(tweet)
    return tweet.lower().strip()

In [111]:
phrases_text = phrases_t.apply(clean_up)
phrases_text

0                                    argh   i hate my life
1                                                 i m good
2                                            enjoy my life
3                                    rt   i do what i want
4        my life is just a series of unfortunate fucked...
                               ...                        
11429    ajhg dfakjh ajh a i m so disappointed in mysel...
11430                                  rt   i suck at life
11431    my life is the shit i guess that this was mean...
11432                                        i hate myself
11433                                     i feel depressed
Name: Text, Length: 11434, dtype: object

#### Tokenize & Lemmatize (lemma to do later)

In [117]:
tokenized = phrases_text.apply(word_tokenize)

In [131]:
# porter_text = tokenized.apply(PorterStemmer.stem(tokenized))

# lemmatizer = tokenized.apply(WordNetLemmatizer) 

#### Remove stopwords

In [90]:
#And we are going to create a function that removes stopwords

stopwords_list = set(stopwords.words('english'))

def remove_stopwords(tweet):
    return [x for x in tweet if x not in stopwords_list]

In [132]:
f = tokenized.apply(remove_stopwords)

In [133]:
f

0                                       [argh, hate, life]
1                                                   [good]
2                                            [enjoy, life]
3                                               [rt, want]
4        [life, series, unfortunate, fucked, events, ma...
                               ...                        
11429               [ajhg, dfakjh, ajh, disappointed, orz]
11430                                     [rt, suck, life]
11431                           [life, shit, guess, meant]
11432                                               [hate]
11433                                    [feel, depressed]
Name: Text, Length: 11434, dtype: object

In [134]:
# compiling feature lists of words from positive reviews and words from the negative tweets 
#to hopefully see trends in specific types of words in positive or negative tweets.

all_words = []
for index, value in f.iteritems():
    if value not in all_words:
        all_words += value

word_features = [x[0] for x in nltk.FreqDist(all_words).most_common(3000)]

In [135]:
word_features

['life',
 'rt',
 'good',
 'happy',
 'love',
 'hate',
 'want',
 'god',
 'blessed',
 'feel',
 'fuck',
 'sad',
 'blessing',
 'thanks',
 'deserve',
 'best',
 'great',
 'everything',
 'bad',
 'like',
 'friends',
 'ever',
 'really',
 'fine',
 'shit',
 'get',
 'know',
 'boring',
 'worst',
 'fucking',
 'much',
 'lol',
 'always',
 'tripping',
 'need',
 'proud',
 'awesome',
 'meekmill',
 'person',
 'im',
 'sucks',
 'never',
 'perfect',
 'got',
 'girl',
 'people',
 'think',
 'thank',
 'stressed',
 'change',
 'use',
 'man',
 'depressed',
 'live',
 'one',
 'ugly',
 'things',
 'feeling',
 'beautiful',
 'amazing',
 'die',
 'thankful',
 'suck',
 'nothing',
 'damn',
 'mood',
 'pretty',
 'dont',
 'could',
 'time',
 'way',
 'world',
 'omg',
 'reiatable',
 'complete',
 'wish',
 'yeah',
 'true',
 'oh',
 'day',
 'haha',
 'yes',
 'still',
 'make',
 'headache',
 'cause',
 'say',
 'loving',
 'living',
 'right',
 'swear',
 'single',
 'truly',
 'everyone',
 'thing',
 'every',
 'sick',
 'actually',
 'even',
 'hon

#### Find features

In [136]:
# Next, we're going to build a quick function that will find these top 3,000 words 
# in our positive and negative documents, marking their presence as either positive or negative:

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [237]:
# we can do this for all of our documents, saving the feature existence booleans 
# and their respective positive or negative categories by doing:

features_f = f.apply(find_features)

# Convert the list of dictionaries to dataframe: 

features_new = pd.DataFrame.from_dict(list(features_f), orient="columns")

# Create the column Target in the new dataframe:

features_new['Target'] = phrases['Target']

In [238]:
features_new

Unnamed: 0,life,rt,good,happy,love,hate,want,god,blessed,feel,...,cain,nooo,wio,hunna,contradict,elf,tonight,contemplate,linddsey,Target
0,True,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
1,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
3,False,True,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,1
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11429,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
11430,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
11431,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
11432,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0


#### Data splitting

In [239]:
# Let's split the x and y. Also the training and testing set:

X = features_new.drop(columns="Target")
y = features_new["Target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

#### Linear Regression

In [240]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_train, y_train)
reg.score(X_train, y_train)

0.8302320697200938

#### Random Forest

In [241]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

rf.fit(X_train, y_train)

RandomForestRegressor(n_estimators=1000, random_state=42)

In [246]:
rf.score(X_train, y_train)

0.9399490916208144

#### Random Forest Classifier

In [260]:
from sklearn.ensemble import RandomForestClassifier

rfclass = RandomForestClassifier(n_estimators=1000, random_state=42)
rfclass.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=42)

In [261]:
rfclass.score(X_train, y_train)

0.9844757844101891

In [270]:
# Save Random Forest Classifier in pickle:

rfclass_model = 'rfclass_model.sav'
pickle.dump(rfclass, open(rfclass_model, 'wb'))

In [272]:
# Load the model:

loaded_rfclass = pickle.load(open(rfclass_model, 'rb'))
result_rfclass = loaded_rfclass.score(X_test, y_test)
print(result_rfclass)

0.9379099256668124


#### KNN Classifier

In [249]:
n_neighbors = 7

knn = KNeighborsClassifier(n_neighbors)
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'.format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'.format(knn.score(X_test, y_test)))

Accuracy of K-NN classifier on training set: 0.93
Accuracy of K-NN classifier on test set: 0.91


In [271]:
# Save the pickle knn:

knn_model = 'knn_model.sav'
pickle.dump(knn, open(knn_model, 'wb'))

In [273]:
loaded_knn = pickle.load(open(knn_model, 'rb'))
result_knn = loaded_knn.score(X_test, y_test)
print(result_knn)

0.9134236991692173


In [None]:
------------------------

#### Creating a module for Sentiment Analysis with NLTK

In [274]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [275]:
voted_classifier = VoteClassifier(result_rfclass, result_knn)

In [276]:
"""
KNN Classifier guardarlo en pickle y cargarlo.
transformar texto de la misma forma que los del resto del corpus. funcion process.

def clean_up(tweet):
    tweet = remove_numbers(tweet)
    tweet = remove_special_char(tweet)
    return tweet.lower().strip()

"""


def sentiment(text):
    #hacer preprocessing aqui dentro
    clean_text = word_tokenize(text.lower())
    feats = find_features(clean_text)
    return voted_classifier.classify(feats),voted_classifier.confidence(feats)

In [277]:
import sentiment_mod as s

ModuleNotFoundError: No module named 'sentiment_mod'

In [279]:
# With that, we can now use this file, and the sentiment function as a module. 
# Here's an example script that might utilize the module:


print(sentiment("My life's horrible"))

AttributeError: 'numpy.float64' object has no attribute 'classify'