In [1]:
# Required libraries.
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import spacy
import string
import nltk
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")
from spacy import tokenizer
from spacy.lang.en import English
nlp = English()
tkz = tokenizer.Tokenizer(nlp.vocab)
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import re
from bs4 import BeautifulSoup
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import unicodedata
import pickle

In [2]:
train_clean = pd.read_pickle('/Users/simonefacchiano/Desktop/Data Science/SL/Project/train_clean_2.pkl')

train = pd.read_json('/Users/simonefacchiano/Desktop/Data Science/SL/Project/SL-Final-Project/train.json')

In [3]:
train['age_class'] = pd.cut(
        train["age"],
        bins=[12, 18, 28, 50],
        labels=[0, 1, 2]
    ).astype("int")

# Possiamo quindi rimuovere la variabile age originale
train = train.drop('age', axis = 1)

In [4]:
train_clean['age_class'] = pd.cut(
        train_clean["age"],
        bins=[12, 18, 28, 50],
        labels=[0, 1, 2]
    ).astype("int")

# Possiamo quindi rimuovere la variabile age originale
train = train_clean.drop('age', axis = 1)

In [5]:
train['word_count'] = train['post'].apply(lambda x: len(str(x).split()))

In [6]:
train.insert(1, 'clean_post', train_clean.post)

# Intuizione

L'idea è la seguente. Noi per il momento abbiamo solo considerato il contenuto delle frasi. Ci siamo riusciti facendo un pre-processing bello corposo, e poi facendo l'embedding dei post per avere una rappresentazione numerica, diventata l'unica feature all'interno del nostro primissimo modello logistico (0.64 di accuracy).

Tuttavia, non stiamo considerando altri fattori. Ad esempio, potrebbero esserci indicazioni importanti dalla punteggiatura, dalle emoticon, dai link o dell'utilizzo di lettere maiuscole e minuscole, oltre ovviamente alla lunghezza dei post.
Per questo motivo, proviamo ora a vedere come si distribuiscono questi elementi tra i due generi.

In un secondo momento, ripeteremo questa stessa analisi anche per le classi di età.

### Emoticon

In [7]:
train[train['post'].str.contains(":\)", regex=True)].gender.value_counts()

# 12mila vs 7mila... e non è l'unico esempio

Series([], Name: gender, dtype: int64)

In [8]:
# Creiamo una lista di possibili emoticon:
emoticons = [r':\)', r':-\)', r':\(', r':-\(', r';\)', r';-\)', r':D', r':-D', r':P', r':-P', r':O', r':-O', r':\|', r':-\|', r'>:\(', r":'\(", r":'-\(", r'XD', r'<3', r':3', r'>:-0']

# Cerchiamo:
len( train[train['post'].str.contains('|'.join(emoticons), regex=True)] )
# 55mila righe... più del 10% dell'intero dataset contiene queste emoticon

0

In [9]:
# Le emoticon possono discriminare il genere...
train[train['post'].str.contains('|'.join(emoticons), regex=True)].gender.value_counts() # c'è una evidente differenza

Series([], Name: gender, dtype: int64)

In [10]:
# ... ma soprattutto possono discriminare l'età
train[train['post'].str.contains('|'.join(emoticons), regex=True)].age_class.value_counts()

Series([], Name: age_class, dtype: int64)

In [11]:
# creiamo la variabile "has_emoticon"
train['has_emoticon'] = train['post'].str.contains('|'.join(emoticons), regex=True).astype(int)

In [12]:
# C'è però un problema da tenere in considerazione: 
# è vero, a usare le emoticon sono soprattuto i giovani... ma poiché ci sono meno persone che della seconda classe d'età abbiamo che:
train[train.has_emoticon == 0].age_class.value_counts()

# A *non* usare le emoticon sono nuovamente i giovani!
# Dovremo risolvere questo inconveniente usando dei pesi

1    239520
0    169678
2     94699
Name: age_class, dtype: int64

### Punteggiatura

In [13]:
import re

characters = ['...', '!!', '??', '?!']
pattern = '|'.join([re.escape(char) for char in characters])

len(train[train['post'].str.contains(pattern)])

0

Forse non cambia troppo con il genere...

In [14]:
train[train['post'].str.contains(pattern)].gender.value_counts()

Series([], Name: gender, dtype: int64)

Ma con l'età scopriamo una cosa interessante...

In [15]:
train[train['post'].str.contains(pattern)].age_class.value_counts()

Series([], Name: age_class, dtype: int64)

In [16]:
train['has_punctuation'] = train['post'].str.contains(pattern).astype(int)

# Ma di nuovo, serviranno probabilmente dei pesi

### Vocabolario e link

In [17]:
# L'idea è quella di sfruttare la presenza di parole che siano "specifiche" di una certa classe (di età o di genere).
# Per farlo usiamo la seguente funzione:

from nltk.probability import FreqDist

def most_common_words(dataset):
    #stop = set(stopwords.words("english"))
    corpus = [word for i in dataset["clean_post"].str.split().values.tolist() for word in i] # if (word not in stop)
    most_common = FreqDist(corpus).most_common()
    words, frequency = [], []
    for word, count in most_common:
        words.append(word)
        frequency.append(count)   
    return dict(zip(words, frequency))

In [18]:
### Con queste ci ho provato ma non migliorano il modello

# Estraiamo poi le parole più comuni per ogni gruppo

girl_dict = most_common_words(train[train.gender == 'female'])
boy_dict = most_common_words(train[train.gender == 'male'])

young_dict = most_common_words(train[train.age_class == 0])
medium_dict = most_common_words(train[train.age_class == 1])
old_dict = most_common_words(train[train.age_class == 2])

In [19]:
### Ricopio quello che dice il paper

girl_words = [word for word, freq in girl_dict.items() if freq > 5000]
boy_words = [word for word, freq in boy_dict.items() if freq > 5000]

young_words = [word for word, freq in young_dict.items() if freq > 5000]
medium_words = [word for word, freq in medium_dict.items() if freq > 5000]
old_words = [word for word, freq in old_dict.items() if freq > 5000]

In [20]:
# E infine facciamo una differenza tra i set di parole, in modo da ottenere parole molto "specifiche" per un certo sottogruppo di autori

# Ma anche questo funzionava poco. Seguiamo cosa dice il paper

only_girls = list(set(girl_words) - set(boy_words))
only_boys = list(set(boy_words) - set(girl_words))

only_young = list(set(young_words) - set(medium_words) - set(old_words))
only_medium = list(set(medium_words) - set(young_words) - set(old_words))
only_old = list(set(old_words) - set(medium_words) - set(young_words))

In [22]:
# Il paper propone la seguente cosa:

# si prendono le parole che hanno una frequenza > 5000
# si vedono quelle parole che hanno una differenza di frequenza maggiore tra i vari gruppi
# e si considerano quelle come feature

# Ad esempio, la parola "football" comparirà con una frequenza molto grande per i maschi e molto piccola per le femmine, quindi la parola "football" è in qualche modo discriminante.
# Proviamo a farlo

#                  ----> #### GENDER #### <----

dict_boys_vs_girls = dict()

for word in boy_words:
    freq_m = boy_dict[word]

    try:
        freq_f = girl_dict[word]
    except KeyError:
        freq_f = 0

    dict_boys_vs_girls[word] = freq_m - freq_f # se > 0 compare più nei maschi, se < 0 compare di più nelle femmine

In [23]:
sorted(dict_boys_vs_girls.items(), key=lambda x: x[1], reverse=True)

[('game', 12127),
 ('bush', 11679),
 ('war', 9728),
 ('post', 9506),
 ('site', 7111),
 ('government', 6777),
 ('president', 6465),
 ('blog', 6404),
 ('news', 6385),
 ('use', 6371),
 ('iraq', 6147),
 ('american', 6087),
 ('system', 5898),
 ('kerry', 5886),
 ('play', 5834),
 ('games', 5694),
 ('team', 5687),
 ('com', 5664),
 ('america', 5180),
 ('country', 5016),
 ('state', 4909),
 ('web', 4819),
 ('states', 4819),
 ('point', 4683),
 ('music', 4661),
 ('information', 4637),
 ('free', 4606),
 ('article', 4454),
 ('film', 4351),
 ('gay', 4338),
 ('united', 4155),
 ('playing', 4152),
 ('played', 4107),
 ('political', 4025),
 ('media', 3931),
 ('law', 3896),
 ('win', 3855),
 ('power', 3769),
 ('s', 3755),
 ('movie', 3721),
 ('problem', 3690),
 ('based', 3678),
 ('national', 3676),
 ('public', 3590),
 ('internet', 3370),
 ('business', 3252),
 ('video', 3233),
 ('page', 3226),
 ('support', 3210),
 ('al', 3198),
 ('link', 3187),
 ('john', 3176),
 ('album', 3171),
 ('case', 3141),
 ('question', 

In [24]:
# Stessa cosa per l'età!
from collections import Counter

# Lo dividiamo in 3 parti:
# young vs rest
# medium vs rest
# old vs rest

dict_young_vs_rest = dict()
dict_medium_and_old = Counter(medium_dict) + Counter(old_dict)

for word in young_words:
    freq_y = young_dict[word]

    try:
        freq_r = dict_medium_and_old[word]
    except KeyError:
        freq_r = 0

    dict_young_vs_rest[word] = freq_y - freq_r # se > 0 compare più nei giovani, se < 0 compare di più nelle altre classi

########################

dict_medium_vs_rest = dict()
dict_young_and_old = Counter(young_dict) + Counter(old_dict)

for word in medium_words:
    freq_med = medium_dict[word]

    try:
        freq_re = dict_young_and_old[word]
    except KeyError:
        freq_re = 0

    dict_medium_vs_rest[word] = freq_med - freq_re # se > 0 compare più nei medi, se < 0 compare di più nelle altre classi


########################

dict_old_vs_rest = dict()
dict_young_and_medium = Counter(young_dict) + Counter(medium_dict)

for word in old_words:
    freq_o = old_dict[word]

    try:
        freq_rest = dict_young_and_medium[word]
    except KeyError:
        freq_rest = 0

    dict_old_vs_rest[word] = freq_o - freq_rest # se > 0 compare più nei vecchi, se < 0 compare di più nelle altre classi

In [28]:
sorted(dict_old_vs_rest.items(), key=lambda x: x[1], reverse=True)

[('iraq', -1631),
 ('information', -3557),
 ('president', -3977),
 ('government', -4689),
 ('children', -5723),
 ('company', -6843),
 ('bush', -7615),
 ('books', -7741),
 ('com', -8182),
 ('power', -8185),
 ('church', -8279),
 ('state', -8395),
 ('war', -8512),
 ('american', -8563),
 ('women', -8705),
 ('john', -8786),
 ('woman', -9115),
 ('office', -9919),
 ('country', -9938),
 ('body', -10329),
 ('mother', -10779),
 ('men', -10825),
 ('case', -11347),
 ('line', -11745),
 ('number', -11839),
 ('living', -11934),
 ('site', -12047),
 ('city', -12454),
 ('white', -13117),
 ('months', -13552),
 ('water', -13555),
 ('small', -13592),
 ('group', -13817),
 ('news', -13963),
 ('kids', -14042),
 ('open', -14077),
 ('problem', -14132),
 ('set', -14383),
 ('black', -14710),
 ('early', -14720),
 ('looked', -15006),
 ('check', -15193),
 ('heard', -15477),
 ('taking', -15681),
 ('word', -15786),
 ('gave', -16066),
 ('weeks', -16198),
 ('hand', -16546),
 ('couple', -16746),
 ('true', -16771),
 ('wor

In [60]:
boys_best = sorted(dict_boys_vs_girls.items(), key=lambda x: x[1], reverse=True)[:150]
girls_best = sorted(dict_boys_vs_girls.items(), key=lambda x: x[1], reverse=False)[:150]

young_best = sorted(dict_young_vs_rest.items(), key=lambda x: x[1], reverse=True)[:150]
medium_best = sorted(dict_medium_vs_rest.items(), key=lambda x: x[1], reverse=True)[:150]
old_best = sorted(dict_old_vs_rest.items(), key=lambda x: x[1], reverse=True)[:150]

# In particolare per i giovani, durante il preprocessing abbiamo trovato altre parole caratteristiche:

other_young_words = [' bf ', ' gf ', 'bday', 'peeps', 'b/c', ' cos ',
               ' jk ', 'wtf', 'WTF', 'omg', 'OMG', 'lol', 'omfg', 'OMFG', 'XD', 'wuts',
                ' cuz ', ' ppl ', 'cant', 'nvr', ' enuf ', ' u ', ' n ', ' r ', ' luv ', 'haha', 'yeah', 'oh']

parole_discriminanti = list(set([tuple[0] for tuple in boys_best + girls_best + young_best + other_young_words + medium_best + old_best]))

In [None]:
parole_discriminanti

In [62]:
# Ora la parte più difficile. Per ognuna di queste parole, dobbiamo creare una variabile 0/1, che ci dice se quella parola è presente o meno nel testo

def words_variables(post):
    var_dict = {}
    for parola in parole_discriminanti:
        nome_variabile = f'has_{parola}'
        var_dict[nome_variabile] = 1 if parola in post else 0
    return pd.Series(var_dict)

train_parole = train['post'].apply(words_variables)
train = pd.concat([train, train_parole], axis = 1)


In [None]:
# E a questo punto non resta altro se non creare la nuova variabile:

# train['girl_word'] = train['post'].apply(lambda x: int(any(word in x for word in only_girls)))
# train['boy_word'] = train['post'].apply(lambda x: int(any(word in x for word in only_boys)))

# train['young_word'] = train['post'].apply(lambda x: int(any(word in x for word in only_young)))
# train['medium_word'] = train['post'].apply(lambda x: int(any(word in x for word in only_medium)))
# train['old_word'] = train['post'].apply(lambda x: int(any(word in x for word in only_old)))

# # Factorize
# train['girl_word'] = pd.factorize(train.girl_word)[0]
# train['boy_word'] = pd.factorize(train.boy_word)[0]

# train['young_word'] = pd.factorize(train.young_word)[0]
# train['medium_word'] = pd.factorize(train.medium_word)[0]
# train['old_word'] = pd.factorize(train.old_word)[0]

In [None]:
train

Unnamed: 0,post,clean_post,gender,age_class,word_count,has_emoticon,has_punctuation,has_toy,has_theloboy,has_match,...,has_start,has_pepper,has_speech,has_couple,has_kim,has_train,has_critical,has_shit,has_taskforce,has_remain
0,ooh shiny commenting,ooh shiny commenting,female,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,wuts parade suked band battle kicked ass jims ...,wuts parade suked band battle kicked ass jims ...,male,0,15,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,anymore concerned everyday bold faced liar ahe...,anymore concerned everyday bold faced liar ahe...,female,1,25,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,roof sunset posted paul,roof sunset posted paul,male,1,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,gawd luv nanny absolutely greatest woman earth...,gawd luv nanny absolutely greatest woman earth...,female,1,204,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1034,thursday jun p m forgotten conversations lunch...,thursday jun p m forgotten conversations lunch...,female,1,30,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1035,cause grief trying hard told far kept trying f...,cause grief trying hard told far kept trying f...,female,0,63,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1036,talking hear whispering ultimate hubris believ...,talking hear whispering ultimate hubris believ...,female,2,30,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1037,yeah heard phantom planet alright heard song c...,yeah heard phantom planet alright heard song c...,male,1,32,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### Passiamo ora a vedere la presenza di URL

In [None]:
train[train.post.str.contains('urlLink')].age_class.value_counts()

# sembra che i giovanissimi ne usino di meno...

Series([], Name: age_class, dtype: int64)

In [63]:
train['has_URL'] = train['post'].str.contains('urlLink').astype(int)

### Lunghezza del post

In [None]:
train.groupby('age_class')['word_count'].mean()

age_class
0    66.029674
1    78.481799
2    87.673469
Name: word_count, dtype: float64

### Sentence count, average word length e average sentence length (NON DISCRIMINANO)

In [None]:
# # Sentence counts
# train["sent_count"] = train["post"].map(lambda x: len(sent_tokenize(x)))
# # Average word length
# train["avg_word_len"] = train["post"].map(lambda x: np.mean([len(w) for w in str(x).split()])).fillna(0)
# # Average sentence length
# train["avg_sent_len"] = train["post"].map(lambda x: np.mean([len(w.split()) for w in sent_tokenize(x)])).fillna(0)

In [None]:
# train.groupby('age_class')['sent_count'].mean()

### Part of Speech (POS)

In [5]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [6]:
pos_list = []

sentence = "I was running down the street when this happened"
for token in nlp(sentence):
    pos_list.append(token.tag_)

pos_list

['PRP', 'VBD', 'VBG', 'IN', 'DT', 'NN', 'WRB', 'DT', 'VBD']

In [7]:
from collections import Counter

Counter(pos_list)

Counter({'PRP': 1, 'VBD': 2, 'VBG': 1, 'IN': 1, 'DT': 2, 'NN': 1, 'WRB': 1})

In [9]:
def pos_counter(text):
    pos_list = []
    for token in nlp(text):
        pos_list.append(token.tag_)

    return dict(Counter(pos_list))

def pos_variables(text):
    
    verbs = pos_counter(text).get('VBD', 0) + pos_counter(text).get('VBG', 0) + pos_counter(text).get('VB', 0) + pos_counter(text).get('VBP', 0) + pos_counter(text).get('VBN', 0) + pos_counter(text).get('VBZ', 0)
    prepositions = pos_counter(text).get('PRP', 0)
    articles = pos_counter(text).get('DT', 0)

    return verbs, prepositions, articles

In [10]:
train['pos'] = train['post'].apply(pos_variables)

train[['verbs', 'prepositions', 'articles']] = train['pos'].str.extract(r'\((\d+), (\d+), (\d+)\)')
train[['verbs', 'prepositions', 'articles']] = train[['verbs', 'prepositions', 'articles']].astype(int)

KeyboardInterrupt: 

### Recap

In [None]:
#train['gender'] = train['gender'].map({'male': 1, 'female': 0})

train.columns

In [73]:
import pickle

def save_object(obj, filename):
    with open(filename, 'wb') as outp:  # Overwrites any existing file.
        pickle.dump(obj, outp, pickle.HIGHEST_PROTOCOL)

In [74]:
save_object(train, 'train_features_paper.pkl')

In [86]:
train.columns.values

array(['post', 'clean_post', 'gender', 'age_class', 'word_count',
       'has_emoticon', 'has_punctuation', 'has_believe', 'has_version',
       'has_com', 'has_woke', 'has_lunch', 'has_spent', 'has_power',
       'has_friends', 'has_ago', 'has_stupid', 'has_watch', 'has_living',
       'has_found', 'has_cuz', 'has_trying', 'has_pay', 'has_im',
       'has_ah', 'has_ya', 'has_seen', 'has_college', 'has_americans',
       'has_interesting', 'has_plus', 'has_michael', 'has_c', 'has_story',
       'has_beer', 'has_camp', 'has_O', 'has_room', 'has_online',
       'has_john', 'has_site', 'has_kept', 'has_place', 'has_de',
       'has_blue', 'has_human', 'has_face', 'has_finally', 'has_told',
       'has_mad', 'has_hopefully', 'has_hand', 'has_morning', 'has_bed',
       'has_play', 'has_small', 'has_care', 'has_drinking', 'has_america',
       'has_works', 'has_early', 'has_dad', 'has_beautiful', 'has_school',
       'has_soon', 'has_music', 'has_crap', 'has_ha', 'has_funny',
       'has_us

# Test

#### We do all the previous steps again, this time on the test set

In [64]:
test = pd.read_json('/Users/simonefacchiano/Desktop/Data Science/SL/Project/SL-Final-Project/test.json')
test_clean = pd.read_pickle('/Users/simonefacchiano/Desktop/Data Science/SL/Project/train_clean.pkl')

In [65]:
test.insert(1, 'clean_post', test_clean.post)

#### Word Count

In [66]:
test['age_class'] = pd.cut(
        test["age"],
        bins=[12, 18, 28, 50],
        labels=[0, 1, 2]
    ).astype("int")

test = test.drop('age', axis = 1)


test_clean['age_class'] = pd.cut(
        test_clean["age"],
        bins=[12, 18, 28, 50],
        labels=[0, 1, 2]
    ).astype("int")

test_clean = test_clean.drop('age', axis = 1)

test['word_count'] = test['post'].apply(lambda x: len(str(x).split()))

#### Emoticon

In [67]:
emoticons = [r':\)', r':-\)', r':\(', r':-\(', r';\)', r';-\)', r':D', r':-D', r':P', r':-P', r':O', r':-O', r':\|', r':-\|', r'>:\(', r":'\(", r":'-\(", r'XD', r'<3', r':3', r'>:-0']

test['has_emoticon'] = test['post'].str.contains('|'.join(emoticons), regex=True).astype(int)

#### Punctuation

In [68]:
import re

characters = ['...', '!!', '??', '?!']
pattern = '|'.join([re.escape(char) for char in characters])

test['has_punctuation'] = test['post'].str.contains(pattern).astype(int)

#### Vocabulary & discriminative words

In [69]:
# Qui riprendiamo le parole discriminanti che abbiamo estratto dal train nellle celle sopra
parole_discriminanti

['hope',
 'hehe',
 'weeks',
 'york',
 'sister',
 'friend',
 'members',
 'anymore',
 'die',
 'click',
 'doesnt',
 'totally',
 'homework',
 'plans',
 'past',
 'asked',
 'sort',
 'cd',
 'game',
 'lol',
 'military',
 'fast',
 'r',
 'afternoon',
 'boy',
 'w',
 'places',
 'sad',
 'thinking',
 'completely',
 'thank',
 'ha',
 'group',
 'support',
 'pissed',
 'comes',
 'knew',
 'seriously',
 'weird',
 'list',
 'drinking',
 'love',
 'main',
 'type',
 'shall',
 'enjoy',
 'shows',
 'sex',
 'friends',
 'americans',
 'place',
 'wonder',
 'force',
 'care',
 'wat',
 'ill',
 'fact',
 'future',
 'yeah',
 'makes',
 'alot',
 'bed',
 'soon',
 'thats',
 'sorry',
 'deal',
 'company',
 'public',
 'excited',
 'trying',
 'moving',
 'question',
 'finally',
 'supposed',
 'paul',
 'pay',
 'summer',
 'listen',
 'record',
 'problem',
 'bye',
 'weekend',
 'sit',
 'club',
 'cool',
 'rock',
 'entire',
 'state',
 'birthday',
 'title',
 'money',
 'mad',
 'saying',
 'yay',
 'mood',
 'teacher',
 'mom',
 'truly',
 'second',

In [70]:
def words_variables(post):
    var_dict = {}
    for parola in parole_discriminanti:
        nome_variabile = f'has_{parola}'
        var_dict[nome_variabile] = 1 if parola in post else 0
    return pd.Series(var_dict)

test_parole = test['post'].apply(words_variables)
test = pd.concat([test, test_parole], axis = 1)

In [33]:
test

Unnamed: 0,post,clean_post,gender,age_class,word_count,has_emoticon,has_punctuation,has_hope,has_hehe,has_weeks,...,has_men,has_taking,has_bill,has_hey,has_saturday,has_remember,has_party,has_walking,has_share,has_brother
0,Thabo admits defeat on quiet diplomacy Mbeki ...,ooh shiny new commenting,male,1,185,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,Brainbench welcomes its 5 millionth subscriber...,today parade suked wasnt bad band year battle ...,male,1,145,0,1,0,0,0,...,1,1,0,1,0,0,0,0,1,0
2,"Even though the air in Jerusalem is dry, it is...",know anymore concerned everyday want bold face...,female,1,45,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,there's nothing else more embarassing in life ...,roof sunset posted paul,female,1,158,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,Today I had a glass artist over for a firing. ...,god love nanny absolutely greatest woman earth...,female,2,208,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131698,em- i hope you go for the other job. it sound...,cali ok ti ame missed stevi day got wasn t abl...,female,1,24,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
131699,"BLOOD PRESSURE IS REALLY REALLY HIGH, I HAD TO...",conference building,female,1,57,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
131700,I was hoping for good news but there is no new...,dislaimer pictures shown taken heavy heart sto...,female,1,36,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
131701,"After the slow week we have had, and a couple ...",patio garden,male,2,261,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


#### URL link

In [71]:
test['has_URL'] = test['post'].str.contains('urlLink').astype(int)

#### SAVE

In [72]:
import pickle

def save_object(obj, filename):
    with open(filename, 'wb') as outp:  # Overwrites any existing file.
        pickle.dump(obj, outp, pickle.HIGHEST_PROTOCOL)

save_object(test, 'test_features_paper.pkl')