# Goal is to answer the question: can a machine detect a person's gender based on their tweet?

Steps:
1. Get twitter data
2. Clean
3. Analyze and visualize
4. Build model
5. Test model

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import tweepy as tw
import seaborn as sns
import json
import pprint
from matplotlib import pyplot as plt

# Import libraries for WordCloud
from wordcloud import WordCloud,ImageColorGenerator
from PIL import Image
import urllib
import requests

import spacy
import nltk
from nltk import PorterStemmer
import textblob
from textblob import TextBlob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Connect to twitter API
path_auth = '/Users/allenj/Documents/Keys/auth_twitter.json'
auth = json.loads(open(path_auth).read())
pp = pprint.PrettyPrinter(indent=4)

my_consumer_key = auth['my_consumer_key']
my_consumer_secret = auth['my_consumer_secret']
my_access_token = auth['your_access_token']
my_access_token_secret = auth['my_access_token_secret']

auth = tw.OAuthHandler(my_consumer_key, my_consumer_secret)
auth.set_access_token(my_access_token, my_access_token_secret)
api = tw.API(auth)

type(api)

tweepy.api.API

# 1. Get Twitter data

In [3]:
# Upload list of desired users
# Gender 0 = male, 1 = female
users = pd.read_csv('../Data/twitter-users.csv')
users

Unnamed: 0,user,name,gender,followers_millions,activity,country
0,aaker,Jennifer Aaker,1,-,-,-
1,AdamMGrant,Adam Grant,0,-,-,-
2,Adele,Adele,1,27,Musician,United Kingdom
3,akshaykumar,Akshay Kumar,0,36,Actor,India
4,aliciakeys,Alicia Keys,1,30,Musician,United States
...,...,...,...,...,...,...
69,taylorswift13,Taylor Swift,1,86,Musician,United States
70,TheEllenShow,Ellen DeGeneres,1,80,Comedian and television hostess,United States
71,UnhealthyTruth,Robyn O'Brien,1,-,-,-
72,VeronicaMcG,Veronica McGregor,1,-,-,-


In [4]:
# Get collection of tweets from these usernames and store it into a new dataframe
list = []

for index, row in users.iterrows():
    tweets = api.user_timeline(screen_name=row['user'], count=150, include_rts=False)
    users_text = [[tweet.user.screen_name, tweet.text, row['gender']] for tweet in tweets]
    tweet_text = pd.DataFrame(data=users_text, 
                        columns=["user", "text", "gender"])
    list.append(tweet_text)

# Merge the list    
tweets = pd.concat(list) 
tweets

Unnamed: 0,user,text,gender
0,aaker,@aunder @KatieS @sarahcpr Done! Also @3GS @di...,1
1,aaker,@KatieS @sarahcpr You two both joined Humor: S...,1
2,aaker,Today was the last day of class. Feeling nosta...,1
3,aaker,@karagoldin Your masks are OUTSTANDING. we lov...,1
4,aaker,Blown away by the comic SWAT team that joined ...,1
...,...,...,...
61,wizkhalifa,Oh yea roll somethin and get tha day started.,0
62,wizkhalifa,At tha rate we’re goin a lawyer gon take joe c...,0
63,wizkhalifa,Get that @McQueenVF #McQueenTeam https://t.co/...,0
64,wizkhalifa,Contact ft @Tyga out now @Spotify \n\nhttps://...,0


In [6]:
tweets['clean_text'] = tweets['text']
tweets

Unnamed: 0,user,text,gender,clean_text
0,aaker,@aunder @KatieS @sarahcpr Done! Also @3GS @di...,1,@aunder @KatieS @sarahcpr Done! Also @3GS @di...
1,aaker,@KatieS @sarahcpr You two both joined Humor: S...,1,@KatieS @sarahcpr You two both joined Humor: S...
2,aaker,Today was the last day of class. Feeling nosta...,1,Today was the last day of class. Feeling nosta...
3,aaker,@karagoldin Your masks are OUTSTANDING. we lov...,1,@karagoldin Your masks are OUTSTANDING. we lov...
4,aaker,Blown away by the comic SWAT team that joined ...,1,Blown away by the comic SWAT team that joined ...
...,...,...,...,...
61,wizkhalifa,Oh yea roll somethin and get tha day started.,0,Oh yea roll somethin and get tha day started.
62,wizkhalifa,At tha rate we’re goin a lawyer gon take joe c...,0,At tha rate we’re goin a lawyer gon take joe c...
63,wizkhalifa,Get that @McQueenVF #McQueenTeam https://t.co/...,0,Get that @McQueenVF #McQueenTeam https://t.co/...
64,wizkhalifa,Contact ft @Tyga out now @Spotify \n\nhttps://...,0,Contact ft @Tyga out now @Spotify \n\nhttps://...


In [None]:
# Count the number of datapoints per person
tweets.groupby(["user", "gender"]).size()

# 2. Clean text

### Remove things

In [None]:
# Clean text by removing things
def remove_pattern(text,pattern):
    
    # re.findall() finds the pattern i.e @user and puts it in a list for further task
    r = re.findall(pattern,text)
    
    # re.sub() removes @user from the sentences in the dataset
    for i in r:
        text = re.sub(i,"",text)
    
    return text

#Remove @ symbol, URL links, and "&amp;"
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['text'], "@[\w]*") #removes all @
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['clean_text'], "&amp;")
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['clean_text'], "#[\w]*") #removes all hashtags
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['clean_text'], "https?:\/\/.*[\r\n]*")
tweets

In [11]:
testset = pd.read_csv('../Data/twitter-testset.csv')
testset

Unnamed: 0.1,Unnamed: 0,user,text,gender,clean_text,length
0,0,BarackObama,My statement on the death of George Floyd: htt...,0,statement death George Floyd,66
1,1,BarackObama,"If you believe in a more just, more generous, ...",0,believe more just more generous more democrati...,140
2,2,BarackObama,"On Memorial Day, we honor those who gave all f...",0,Memorial honor those gave That takes different...,140
3,3,BarackObama,And here’s more on the approach Sweden has tak...,0,here more approach Sweden taken which differs ...,117
4,4,BarackObama,South Korea has focused on testing to guard ag...,0,South Korea focused testing guard against outb...,87


In [10]:
# Do the same for the testing dataset
# Clean text by removing things
testset = pd.read_csv('../Data/twitter-testset.csv')
testset['clean_text'] = np.vectorize(remove_pattern)(testset['text'], "@[\w]*") #removes all @
testset['clean_text'] = np.vectorize(remove_pattern)(testset['clean_text'], "https?:\/\/.*[\r\n]*")
testset['clean_text'] = np.vectorize(remove_pattern)(testset['clean_text'], "&amp;")
testset['clean_text'] = np.vectorize(remove_pattern)(testset['clean_text'], "#[\w]*") #removes all hashtags
testset

NameError: name 'remove_pattern' is not defined

### Remove punctuations

In [None]:
# Remove punctuation, numbers, and special characters
tweets['clean_text'] = tweets['clean_text'].str.replace("[^a-zA-Z#]", " ")
tweets

In [None]:
# Do the same for the testing dataset
# Remove punctuation, numbers, and special characters
testset['clean_text'] = testset['clean_text'].str.replace("[^a-zA-Z#]", " ")
testset

### Remove stop words

In [None]:
# Remove short words less than 3
tweets['clean_text'] = tweets['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
tweets.head(10)

In [None]:
# Do the same for the testing dataset
# Remove short words less than 2
testset['clean_text'] = testset['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
testset.head(10)

### Create new column to count length of clean text

In [None]:
# Count length of characters
tweets['length'] = tweets['clean_text'].apply(len)
tweets

In [None]:
# Do the same for the testing dataset
# Count length
testset['length'] = testset['clean_text'].apply(len)
testset

### Remove rows in training data that have less than desired text length

In [None]:
plt.hist(tweets['length'])
plt.show()

In [None]:
tweets['length'].describe()

In [None]:
# Remove rows where length <= 30
tweets = tweets[tweets.length > 30]
tweets

In [None]:
plt.hist(tweets['length'])
plt.show()

### Tokenize, stem, and stich back

In [None]:
# # Tokenization
# tokenized_tweet = tweets['clean_text'].apply(lambda x: x.split())
# tokenized_tweet.head()

In [None]:
# # Do the same for the testing dataset
# # Tokenization
# tokenized_testset = testset['clean_text'].apply(lambda x: x.split())
# tokenized_testset.head()

In [None]:
# # Stemming
# ps = PorterStemmer()
# tokenized_tweet = tokenized_tweet.apply(lambda x: [ps.stem(i) for i in x])
# tokenized_tweet.head()

In [None]:
# # Do the same for the testing dataset
# # Stemming
# ps = PorterStemmer()
# tokenized_testset = tokenized_testset.apply(lambda x: [ps.stem(i) for i in x])
# tokenized_testset.head()

In [None]:
# # Stich tokens back together
# for i in range(len(tokenized_tweet)):
#     tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
          
# testset['clean_text'] = tokenized_tweet
# testset['clean_text']

In [None]:
# # Stich tokens back together
# for i in range(len(tokenized_tweet)):
#     tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
          
# testset['clean_text'] = tokenized_tweet
# testset['clean_text']

# 4. Model Selectioin and Machine Learning

### Bag of Words

In [7]:
# Bag-of-Words features
bow_vectorizer = CountVectorizer(stop_words='english')

# Bag-of-Words feature matrix
bow = bow_vectorizer.fit_transform(tweets['clean_text'])
df_bow = pd.DataFrame(bow.todense(), columns=bow_vectorizer.get_feature_names())
df_bow

Unnamed: 0,00,000,000th,00ano5u6jn,00pm,01,010101,01aqjixwaw,01h,02,...,ありがとうございます,おおきに,フレッシュネスバーガー,千葉,大阪,東京,神戸,𝓐𝓫𝓸𝓿𝓮,𝓗𝓮𝓪𝓭,𝓦𝓪𝓽𝓮𝓻
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8318,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8319,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8320,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8321,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Do the same for test dataset
# Bag-of-Words feature matrix
bow = bow_vectorizer.transform(testset['clean_text'])
df_bow_test = pd.DataFrame(bow.todense(), columns=bow_vectorizer.get_feature_names())
df_bow_test

Unnamed: 0,00,000,000th,00ano5u6jn,00pm,01,010101,01aqjixwaw,01h,02,...,ありがとうございます,おおきに,フレッシュネスバーガー,千葉,大阪,東京,神戸,𝓐𝓫𝓸𝓿𝓮,𝓗𝓮𝓪𝓭,𝓦𝓪𝓽𝓮𝓻
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Use Bag of Words to Build Model

In [14]:
# Splitting the data into training and validation set
X = df_bow
y = tweets['gender']

# Use Bag-of-Words Features
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X, y, test_size=0.2)

### Use Logistic Regression

In [15]:
# Fitting on Logistic Regression model
logreg = LogisticRegression()
logreg.fit(X_train_bow, y_train_bow)

LogisticRegression()

In [16]:
# The first part of the list is predicting probabilities for gender:0 (male)
# The second part of the list is predicting probabilities for gender:1 (female)
prediction_bow = logreg.predict_proba(X_test_bow)
prediction_bow

array([[0.85447656, 0.14552344],
       [0.12522754, 0.87477246],
       [0.69917376, 0.30082624],
       ...,
       [0.64009378, 0.35990622],
       [0.40174476, 0.59825524],
       [0.44290804, 0.55709196]])

In [17]:
# Calculating the F1 score
# If prediction is greater than or equal to 0.3 than 1, else 0
# Where 0 is for male tweets and 1 is for female tweets
prediction_int = prediction_bow[:,1]>=0.5

prediction_int = prediction_int.astype(np.int)
prediction_int

# Calculating f1 score
log_bow = f1_score(y_test_bow, prediction_int)

log_bow

0.787564766839378

### Predict with separate test dataset

In [18]:
# Check if there is a fit model
logreg.intercept_, logreg.coef_

(array([0.18884943]),
 array([[-0.50990811,  0.65993565,  0.02384009, ...,  0.15187552,
          0.15187552,  0.15187552]]))

In [20]:
z = df_bow_test
pred = logreg.predict_proba(z)
pred

array([[0.54249864, 0.45750136],
       [0.49800956, 0.50199044],
       [0.59343196, 0.40656804],
       [0.70924953, 0.29075047],
       [0.65759751, 0.34240249]])

In [21]:
pred2 = logreg.predict(z)
pred2

array([0, 1, 0, 0, 0])

In [22]:
df = pd.DataFrame(data=pred)
df

Unnamed: 0,0,1
0,0.542499,0.457501
1,0.49801,0.50199
2,0.593432,0.406568
3,0.70925,0.29075
4,0.657598,0.342402


In [23]:
pred2 = pd.DataFrame(data=pred2, columns=['predicted_gender'])
pred2

Unnamed: 0,predicted_gender
0,0
1,1
2,0
3,0
4,0


In [24]:
testset.join(pred2)

Unnamed: 0.1,Unnamed: 0,user,text,gender,clean_text,length,predicted_gender
0,0,BarackObama,My statement on the death of George Floyd: htt...,0,statement death George Floyd,66,0
1,1,BarackObama,"If you believe in a more just, more generous, ...",0,believe more just more generous more democrati...,140,1
2,2,BarackObama,"On Memorial Day, we honor those who gave all f...",0,Memorial honor those gave That takes different...,140,0
3,3,BarackObama,And here’s more on the approach Sweden has tak...,0,here more approach Sweden taken which differs ...,117,0
4,4,BarackObama,South Korea has focused on testing to guard ag...,0,South Korea focused testing guard against outb...,87,0


### Use TF-IDF

In [None]:
# TF-IDF features (Term Frequency-Inverse Document Frequency)
tfidf=TfidfVectorizer(stop_words='english')
tfidf_matrix=tfidf.fit_transform(tweets['clean_text'])
df_tfidf = pd.DataFrame(tfidf_matrix.todense(), columns=tfidf.get_feature_names())
df_tfidf

In [None]:
# Do the same for the test dataset
# TF-IDF features (Term Frequency-Inverse Document Frequency)
tfidf_matrix=tfidf.transform(testset['clean_text'])
df_tfidf_test = pd.DataFrame(tfidf_matrix.todense(), columns=tfidf.get_feature_names())
df_tfidf_test

### Use TF-IDF to Build Model

In [None]:
# Splitting the data into training and validation set
X = df_tfidf
y = tweets['gender']

# Use Bag-of-Words Features
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X, y, test_size=0.2)

In [None]:
# Using TF-IDF Features
logreg.fit(X_train_tfidf, y_train_tfidf)

In [None]:
prediction_tfidf = logreg.predict_proba(X_test_tfidf)
prediction_tfidf

In [None]:
# Calculating the F1 score
prediction_int = prediction_tfidf[:,1]>=0.5
prediction_int = prediction_int.astype(np.int)
prediction_int

# calculating f1 score
log_tfidf = f1_score(y_test_tfidf, prediction_int)
log_tfidf

### Use Decision Tree

In [None]:
# Decision Tree
dtc = DecisionTreeClassifier(criterion='entropy', random_state=1)

In [None]:
# Using Bag of Words as features
dtc.fit(X_train_bow, y_train_bow)
dtc_bow = dtc.predict_proba(X_test_bow)
dtc_bow

In [None]:
# if prediction is greater than or equal to 0.3 than 1 else 0
# Where 0 is for positive sentiment tweets and 1 for negative sentiment tweets
dtc_bow = dtc_bow[:,1]>=0.5

# converting the results to integer type
dtc_int_bow=dtc_bow.astype(np.int)

# calculating f1 score
dtc_score_bow=f1_score(y_test_bow, dtc_int_bow)

dtc_score_bow

In [None]:
# Using TF-IDF
dtc.fit(x_train_tfidf,y_train_tfidf)

In [None]:
dtc_tfidf = dtc.predict_proba(X_test_tfidf)

dtc_tfidf

In [None]:
# if prediction is greater than or equal to 0.3 than 1 else 0
# Where 0 is for positive sentiment tweets and 1 for negative sentiment tweets
dtc_tfidf=dtc_tfidf[:,1]>=0.3

# converting the results to integer type
dtc_int_tfidf=dtc_tfidf.astype(np.int)

# calculating f1 score
dtc_score_tfidf=f1_score(y_test_tfidf,dtc_int_tfidf)

dtc_score_tfidf

In [None]:
# Model Comparison
Algo=['LogisticRegression(Bag-of-Words)','DecisionTree(Bag-of-Words)','LogisticRegression(TF-IDF)','DecisionTree(TF-IDF)']


In [None]:
score = [log_bow,dct_score_bow,log_tfidf,dct_score_tfidf]

compare=pd.DataFrame({'Model':Algo,'F1_Score':score},index=[i for i in range(1,5)])
compare.T


In [None]:
plt.figure(figsize=(18,5))

sns.pointplot(x='Model',y='F1_Score',data=compare)

plt.title('Model Vs Score')
plt.xlabel('MODEL')
plt.ylabel('SCORE')

plt.show()

# Test With Real Text

In [None]:
# Check if there is a fit model

In [None]:
Log_Reg.intercept_, Log_Reg.coef_

In [None]:
test_text = pd.read_csv('../Data/tweetstest.csv')
test_text

In [None]:
bow = bow_vectorizer.transform(test_text['clean_text']) #use .transform() not .fit_transform()
df_bow = pd.DataFrame(bow.todense())
df_bow

In [None]:
prediction_bow = Log_Reg.predict_proba(X)
prediction_bow

In [None]:
test_text = "this is a test tweet to predict my gender baby boo"

# Bag-of-Words feature matrix
bow = bow_vectorizer.fit_transform('test_text')
df_bow = pd.DataFrame(bow.todense())
df_bow

In [None]:
text = "I am so angry"
textBlob = TextBlob(text)
print(f"{textBlob.sentiment}")

# ###### 

#### 