In [116]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [246]:
# Load the combined-posts CSV (path is relative to the notebook's working directory).
# NOTE(review): the 'Unnamed: 0' column in the preview below looks like a saved index;
# consider index_col=0 if it is not needed as a feature.
df = pd.read_csv('../data/linkfree_combined.csv')

In [247]:
# Preview the first two rows.
df.head(2)

Unnamed: 0,type,linkfree_combined
0,INFJ,enfp intj moment sportscenter top ten play pra...
1,ENTP,im finding lack post alarming|||sex boring pos...


In [251]:
# Drop every row that has a missing value in any column.
# NOTE: `df` is rebound here, so the unfiltered frame is no longer reachable afterwards.
df = df.dropna()

In [258]:
def tokenize_for_countvec(post):
    """Return ``post`` with every '|||' post separator swapped for a single space.

    The result is one flat string that can be handed to a text vectorizer
    as a single document.
    """
    segments = post.split('|||')
    return ' '.join(segments)

In [259]:
# Features: one flat text document per row ('|||' separators replaced by spaces).
X = df['linkfree_combined'].apply(tokenize_for_countvec)

# Targets: the `type` column encoded as integer class ids; `le` is kept so
# predictions can be mapped back to their labels later.
le = LabelEncoder()
y = le.fit_transform(df['type'])

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=822,
)

In [261]:
# Learn the vocabulary from the training split only, then reuse that same
# vocabulary for the test split (transform, not fit_transform).
# NOTE(review): per the scikit-learn docs, min_df=2 ignores terms that occur in
# fewer than 2 documents.
vectorizer = CountVectorizer(min_df=2)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [262]:
# Compare the number of training documents with the shape of the document-term matrix.
X_train.shape, X_train_vect.shape

((6071,), (6071, 40948))

In [263]:
# Materialize dense copies of the vectorized splits.
# NOTE(review): with the (6071, 40948) training shape reported above this is
# roughly 250 million cells, almost all zeros. The model is later asked to
# predict on un-densified vectorizer output, so these dense copies are
# probably avoidable — confirm before relying on them for larger data.
X_train_vect_arr = X_train_vect.toarray()
X_test_vect_arr = X_test_vect.toarray()
X_train_vect_arr

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [264]:
# Fit directly on the sparse count matrix: LogisticRegression accepts scipy
# sparse input, so the large dense copy is not needed for training.
# NOTE: the run recorded below hit the max_iter=1000 cap without converging;
# treat this model as a non-converged baseline.
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_vect, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=1000)

In [266]:
# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open('models/not_converged_basic_log_model', 'wb') as model_file:
    pickle.dump(log_model, model_file)

In [267]:
# Score the model on the held-out test split.
# The printed name now matches the assigned name; previously `result` was only
# defined through leftover kernel state and would raise NameError on a fresh run.
results = log_model.score(X_test_vect_arr, y_test)
print(results)

0.6050710718401844


In [268]:
# Made-up sentences used as an ad-hoc sample for the prediction pipeline.
random_test = [
    'hello blah blah blah',
    'i love noodles',
    'noodles are me',
    'i think the world is a beautiful place',
    'sometimes i like to go shopping',
    'the things i do for love',
    'man i love that movie',
    'sometimes the world is a beautiful place but somtimes i hate it so much',
    'wow that waas such a good movie, deadpool was amazing. highly recommended',
]

In [269]:
# Short chat-style messages used as a second ad-hoc sample.
# NOTE(review): these read like real private messages — confirm they are
# acceptable to keep in a shared notebook.
Austin_test = [
    'jeez she def knows how to sound as bitchy as possible',
    'when can you finally never talk to her ever again?',
    'sounds very natural to me',
    'what a cunt',
    'looking forward to that day for you',
    'finally managed to get rid of my slicey driver',
    'can you come over real quick',
    "need help grabbing mel's bag",
    'she gonna meet us there',
    'cool cool',
    'hey! yeah i let you know a week or so ago that i was gonna miss today',
    'just a small family trip. be back in class monday',
    "i'll watch the lecture vid and work on the challanges myself",
]

In [270]:
def preprocess(post_split, get_youtube=False, add_description=False):
    """Clean a list of raw posts and pack them into one '|||'-delimited string.

    Steps, in order: optionally replace YouTube links, drop link tokens,
    strip punctuation / soft hyphens / digits and lowercase, remove English
    stop words, lemmatize, then join everything with ``combine_text``.

    Parameters
    ----------
    post_split : list of str
        Raw posts, one string per post.
    get_youtube : bool, default False
        When True, the posts are first passed through ``replace_youtube``.
        NOTE(review): ``replace_youtube`` is not defined in the visible part
        of this notebook — confirm it is available before enabling this.
    add_description : bool, default False
        Forwarded to ``replace_youtube``; ignored when ``get_youtube`` is False.

    Returns
    -------
    str
        Words separated by single spaces, posts separated by '|||'. Posts
        left with no tokens after cleaning are omitted.
    """
    # Optionally replace youtube links with the youtube title.
    if get_youtube:
        post_split = replace_youtube(post_split, add_description)

    # Drop any 'word' containing http:// or https://. remove_links returns a
    # new list, so the caller's list is not modified by the steps below.
    post_split = remove_links(post_split)

    # Delete ASCII punctuation, the two bullet glyphs and soft hyphens in one
    # pass per post (instead of one pass per character), then lowercase once.
    strip_table = str.maketrans('', '', string.punctuation + '►•' + '\xad')
    post_split = [item.translate(strip_table).lower() for item in post_split]

    # Remove every digit character.
    post_split = [
        ''.join(char for char in item if not char.isdigit())
        for item in post_split
    ]

    # Remove stop words. This runs AFTER punctuation stripping, so contractions
    # such as "i'll" have already become "ill" and are not filtered out.
    # NOTE(review): presumably this matches how the training CSV was cleaned —
    # confirm before changing the order.
    stop_words = set(stopwords.words('english'))
    tokenized_posts = [word_tokenize(item) for item in post_split]
    filtered_posts = [
        [word for word in tokens if word not in stop_words]
        for tokens in tokenized_posts
    ]

    # Lemmatize each word, skipping posts that have no tokens left.
    lemmatizer = WordNetLemmatizer()
    lemmatized_posts = [
        [lemmatizer.lemmatize(word) for word in tokens]
        for tokens in filtered_posts
        if tokens
    ]

    # One string: words separated by a space, posts separated by |||.
    return combine_text(lemmatized_posts)

In [271]:
def remove_links(post_split):
    """Strip link tokens from each post and drop posts with nothing left.

    Each post is split on single spaces; any token containing 'http://' or
    'https://' is discarded and the remaining tokens are re-joined with single
    spaces (so the original spacing between kept tokens is preserved).

    Parameters
    ----------
    post_split : list of str
        Posts to clean. The input list is not modified.

    Returns
    -------
    list of str
        Cleaned posts, in their original order. Posts that are empty or
        whitespace-only after link removal are omitted. (Previously only posts
        whose token list was empty were dropped, so inputs such as '' or
        'https://x.org ' leaked through as empty strings.)
    """
    link_markers = ('http://', 'https://')

    cleaned_posts = []
    for post in post_split:
        kept_words = [
            word for word in post.split(' ')
            if not any(marker in word for marker in link_markers)
        ]
        cleaned = ' '.join(kept_words)
        # Test the joined text, not the token list: splitting '' on ' ' yields
        # [''], which is a non-empty list but an empty post.
        if cleaned.strip():
            cleaned_posts.append(cleaned)
    return cleaned_posts

In [272]:
def combine_text(post_split_split):
    """Flatten tokenized posts into a single string.

    Takes a list of posts, each given as a list of its words, and returns one
    string in which words are separated by a space and posts are separated
    by '|||'.
    """
    joined_posts = []
    for words in post_split_split:
        joined_posts.append(' '.join(words))
    return '|||'.join(joined_posts)

In [273]:
# Run both ad-hoc samples through the cleaning pipeline (default options, so
# no YouTube replacement). Each result is a single '|||'-delimited string.
Austin_test_cleaned = preprocess(Austin_test)
random_test_cleaned = preprocess(random_test)

In [274]:
# Inspect the cleaned, '|||'-delimited text.
Austin_test_cleaned

'jeez def know sound bitchy possible|||finally never talk ever|||sound natural|||cunt|||looking forward day|||finally managed get rid slicey driver|||come real quick|||need help grabbing mels bag|||gon na meet u|||cool cool|||hey yeah let know week ago gon na miss today|||small family trip back class monday|||ill watch lecture vid work challanges'

In [283]:
# Build the one-document lists the vectorizer expects from the raw samples
# rather than from the previous value of the same names. These names switch
# from str to list here, so deriving them from their own old value made the
# cell fail with AttributeError when it was run a second time.
Austin_test_cleaned = [tokenize_for_countvec(preprocess(Austin_test))]
random_test_cleaned = [tokenize_for_countvec(preprocess(random_test))]

In [284]:
# Inspect the single-document list that will be vectorized.
Austin_test_cleaned

['jeez def know sound bitchy possible finally never talk ever sound natural cunt looking forward day finally managed get rid slicey driver come real quick need help grabbing mels bag gon na meet u cool cool hey yeah let know week ago gon na miss today small family trip back class monday ill watch lecture vid work challanges']

In [285]:
# Vectorize the samples with the already-fitted vectorizer (transform only,
# so the training vocabulary is reused and nothing is refit).
Austin_test_vect = vectorizer.transform(Austin_test_cleaned)
random_test_vect = vectorizer.transform(random_test_cleaned)

In [287]:
# Predicted class id (LabelEncoder integer) for the Austin sample.
log_model.predict(Austin_test_vect)

array([15])

In [288]:
# Map the predicted class id back to its original label with the fitted LabelEncoder.
le.inverse_transform(log_model.predict(Austin_test_vect))

array(['ISTP'], dtype=object)

In [289]:
# Predicted class id (LabelEncoder integer) for the made-up sample.
log_model.predict(random_test_vect)

array([9])

In [290]:
# Map the predicted class id back to its original label with the fitted LabelEncoder.
le.inverse_transform(log_model.predict(random_test_vect))

array(['INFP'], dtype=object)