# Goal is to answer the question: can a machine detect a person's gender based on their tweet?

Steps:
1. Get twitter data
2. Clean
3. Analyze and visualize
4. Build model
5. Test model

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import tweepy as tw
import seaborn as sns
import json
import pprint
from matplotlib import pyplot as plt

import spacy
import nltk
from nltk import PorterStemmer
import textblob
from textblob import TextBlob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Connect to twitter API
path_auth = '/Users/allenj/Documents/Keys/auth_twitter.json'
auth = json.loads(open(path_auth).read())
pp = pprint.PrettyPrinter(indent=4)

my_consumer_key = auth['my_consumer_key']
my_consumer_secret = auth['my_consumer_secret']
my_access_token = auth['your_access_token']
my_access_token_secret = auth['my_access_token_secret']

auth = tw.OAuthHandler(my_consumer_key, my_consumer_secret)
auth.set_access_token(my_access_token, my_access_token_secret)
api = tw.API(auth)

type(api)

tweepy.api.API

# 1. Get Twitter data

In [90]:
# Upload list of desired users
# Gender 0 = male, 1 = female
users = pd.read_csv('../Data/twitter-users.csv')
users.sample(n=20)

Unnamed: 0,user,name,gender,followers_millions,activity
53,jk_rowling,J.K. Rowling,1,15.0,Author
38,LiamPayne,Liam Payne,0,33.0,Musician
79,chefannc,Anne Cooper,1,0.01,Chef
32,wizkhalifa,Wiz Khalifa,0,36.0,Musician
35,Harry_Styles,Harry Styles,0,34.0,Musician
7,ladygaga,Lady Gaga,1,81.0,Musician
54,carmeloanthony,Carmelo Anthony,0,9.0,Athlete
65,iSmashFizzle,Ashley C. Ford,1,0.19,Author
3,rihanna,Rihanna,1,96.0,Musician
6,realDonaldTrump,Donald Trump,0,82.0,Politician


In [4]:
users['gender'].value_counts()

1    47
0    36
Name: gender, dtype: int64

In [67]:
# Get collection of tweets from these usernames and store it into a new dataframe
list = []

for index, row in users.iterrows():
    tweets = api.user_timeline(screen_name=row['user'], count=200, include_rts=False)
    users_text = [[tweet.user.screen_name, tweet.text, row['gender']] for tweet in tweets]
    tweet_text = pd.DataFrame(data=users_text, 
                        columns=["user", "text", "gender"])
    list.append(tweet_text)

# Merge the list    
tweets = pd.concat(list) 

In [91]:
tweets.sample(20)

Unnamed: 0,user,text,gender,clean_text
30,aaker,@drewlewisjr @dickc @LeslieBlodgett @davidhorn...,1,Um?!
87,developingjen,Really proud of @tristanharris and this new cl...,1,Really proud of and this new clarity and dire...
65,jtimberlake,.@PilgrimageFest is back for year 5 this weeke...,0,. is back for year 5 this weekend. Get out the...
27,kenyanpundit,@tutasema Yes but if there’s any indication it...,1,Yes but if there’s any indication it could go...
198,AdamMGrant,Don't give leaders a pass for the choices they...,0,Don't give leaders a pass for the choices they...
150,jennpozner,@DrMkWalters @jamiaw @lizzwinstead @ZerlinaMax...,1,Thank you!
116,Adele,I really hope Laura Marling wins the mercury t...,1,I really hope Laura Marling wins the mercury t...
62,Adele,Last night was mad! I had such a great time......,1,Last night was mad! I had such a great time......
125,ruthreichl,You'll have to wait for today's Gift Guide sug...,1,You'll have to wait for today's Gift Guide sug...
90,carmeloanthony,.@roadto2022 #Qatar #GameOnDoha #STAYME7O http...,0,. https://t.co/6RJzM3wPgp


In [51]:
# Count the number of datapoints per person
tweets.groupby(["user", "gender"]).size()

user            gender
AdamMGrant      0         200
Adele           1         194
AnushkaSharma   1         156
ArianaGrande    1         107
AvrilLavigne    1         135
                         ... 
staceyannchin   1         168
susanmcp1       1          60
taylorswift13   1         189
unhealthytruth  1         151
wizkhalifa      0          97
Length: 83, dtype: int64

In [78]:
# Check percentages for each gender
# 0 = male, 1 = female
tweets.gender.value_counts(normalize=True, sort=False)

0    0.463736
1    0.536264
Name: gender, dtype: float64

In [79]:
tweets

Unnamed: 0,user,text,gender
0,BarackObama,We’ve seen the power that our voices have when...,0
1,BarackObama,"On National Gun Violence Awareness Day, we #We...",0
2,BarackObama,"Third, every city in this country should be a ...",0
3,BarackObama,"Second, every mayor should review their use of...",0
4,BarackObama,"First, there are specific evidence-based refor...",0
...,...,...,...
190,developingjen,Fascinating. // This is Your Brain on LSD http...,1
191,developingjen,THIS! // Stacy Brown-Philpot tapped as new Tas...,1
192,developingjen,"Sean Parker, a Facebook and Napster Pioneer, t...",1
193,developingjen,"@harper Christian Loffler, Veiled Grey. Massiv...",1


# 2. Clean text

In [81]:
# Clean text by removing things
def remove_pattern(text,pattern):
    
    # re.findall() finds the pattern i.e @user and puts it in a list for further task
    r = re.findall(pattern,text)
    
    # re.sub() removes @user from the sentences in the dataset
    for i in r:
        text = re.sub(i,"",text)
    
    return text

#Remove @, &, # and everything that follows
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['text'], "@[\w]*") # Removes all @
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['clean_text'], "&amp;") # Removes all &
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['clean_text'], "#[\w]*") # Removes all #
tweets.sample(n=20)

Unnamed: 0,user,text,gender,clean_text
175,developingjen,Best one yet // Why the Conversation About Fur...,1,Best one yet // Why the Conversation About Fur...
176,developingjen,Recent reflections that are really impacting m...,1,Recent reflections that are really impacting m...
177,developingjen,I love this! https://t.co/rRmaDxjqRg,1,I love this! https://t.co/rRmaDxjqRg
178,developingjen,Really great piece by @dgh. So much wisdom. h...,1,Really great piece by . So much wisdom. https...
179,developingjen,The Further Future festival last weekend got m...,1,The Further Future festival last weekend got m...
180,developingjen,@iRowan thank you again for this :).,1,thank you again for this :).
181,developingjen,cc @rosenstein https://t.co/JJk8xKeihx,1,cc https://t.co/JJk8xKeihx
182,developingjen,Thank you @iRowan for making this your cover s...,1,Thank you for making this your cover story fo...
183,developingjen,Fascinating // This Mark Cuban-Backed Startup ...,1,Fascinating // This Mark Cuban-Backed Startup ...
184,developingjen,BBC News - Italian court rules food theft 'not...,1,BBC News - Italian court rules food theft 'not...


In [43]:
import re

In [87]:
tweets.sample(n=20)

Unnamed: 0,user,text,gender,clean_text
147,girlygeekdom,@WorldOfOrdinary @rmcopywriting @treekahlo @Je...,1,I'm seriously considering the self represe...
110,SHAQ,.@djdiesel is now on Twitter https://t.co/hYvw...,0,. is now on Twitter https://t.co/hYvwhfRx3y
136,MariahCarey,@Deborah_Cox Thank you Deborah!! Love you and ...,1,Thank you Deborah!! Love you and your beautif...
85,aliciakeys,New York strong https://t.co/h2NK8oL8kl,1,New York strong https://t.co/h2NK8oL8kl
88,ladygaga,https://t.co/G9sEK0Uo6r,1,https://t.co/G9sEK0Uo6r
150,jimmyfallon,It's time for Tonight Show: At Home Edition Ha...,0,It's time for Tonight Show: At Home Edition Ha...
11,Oprah,Sign up at https://t.co/B35nsuXHrX and I’ll se...,1,Sign up at https://t.co/B35nsuXHrX and I’ll se...
111,LilTunechi,IT’S THAT GKUA Ultra Premium @gkuaofficial #th...,0,IT’S THAT GKUA Ultra Premium https://t.co/w...
57,AdamMGrant,"For every Tiger Woods who specializes early, t...",0,"For every Tiger Woods who specializes early, t..."
52,ruthreichl,Why restaurants matter - even to those who don...,1,Why restaurants matter - even to those who don...


In [94]:
tweets['clean_text'] = tweets['clean_text'].str.extract('(.*)http?')
tweets.sample(20)

Unnamed: 0,user,text,gender,clean_text
16,susanmcp1,"Update, we raised $12.5k from donations means ...",1,
131,NiallOfficial,thanks @applemusic for putting me on the cover...,0,
72,SrBachchan,@SwetaLoveAB @artistrishika 🌹,0,
143,selenagomez,And the final song on Rare… A Sweeter Place ft...,1,
39,NiallOfficial,@cariadoresyou Hahah no worries,0,
108,JLo,¡Gracias @PeopleEnEspanol! ✨🖤🤍 ✨ Los 50 más be...,1,
92,LilTunechi,We lost a King. 824,0,
195,narendramodi,Today was the 4th interaction with CMs. We con...,0,
25,icecube,It ain’t gonna turn out how they think... http...,0,
23,drdre,Can’t wait to have @QuincyDJones on #ThePharma...,0,


In [95]:
tweets

Unnamed: 0,user,text,gender,clean_text
0,BarackObama,We’ve seen the power that our voices have when...,0,
1,BarackObama,"On National Gun Violence Awareness Day, we #We...",0,
2,BarackObama,"Third, every city in this country should be a ...",0,
3,BarackObama,"Second, every mayor should review their use of...",0,
4,BarackObama,"First, there are specific evidence-based refor...",0,
...,...,...,...,...
190,developingjen,Fascinating. // This is Your Brain on LSD http...,1,
191,developingjen,THIS! // Stacy Brown-Philpot tapped as new Tas...,1,
192,developingjen,"Sean Parker, a Facebook and Napster Pioneer, t...",1,
193,developingjen,"@harper Christian Loffler, Veiled Grey. Massiv...",1,


In [15]:
# Clean text by removing things
def remove_pattern(text,pattern):
    
    # re.findall() finds the pattern i.e @user and puts it in a list for further task
    r = re.findall(pattern,text)
    
    # re.sub() removes @user from the sentences in the dataset
    for i in r:
        text = re.sub(i,"",text)
    
    return text

#Remove @ symbol, URL links, and "&amp;"
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['text'], "@[\w]*") #removes all @
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['clean_text'], "&amp;")
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['clean_text'], "#[\w]*")
tweets

Unnamed: 0,user,text,gender,clean_text
0,BarackObama,We’ve seen the power that our voices have when...,0,We’ve seen the power that our voices have when...
1,BarackObama,"On National Gun Violence Awareness Day, we #We...",0,"On National Gun Violence Awareness Day, we to..."
2,BarackObama,"Third, every city in this country should be a ...",0,"Third, every city in this country should be a ..."
3,BarackObama,"Second, every mayor should review their use of...",0,"Second, every mayor should review their use of..."
4,BarackObama,"First, there are specific evidence-based refor...",0,"First, there are specific evidence-based refor..."
...,...,...,...,...
190,developingjen,Fascinating. // This is Your Brain on LSD http...,1,Fascinating. // This is Your Brain on LSD http...
191,developingjen,THIS! // Stacy Brown-Philpot tapped as new Tas...,1,THIS! // Stacy Brown-Philpot tapped as new Tas...
192,developingjen,"Sean Parker, a Facebook and Napster Pioneer, t...",1,"Sean Parker, a Facebook and Napster Pioneer, t..."
193,developingjen,"@harper Christian Loffler, Veiled Grey. Massiv...",1,"Christian Loffler, Veiled Grey. Massive Attac..."


In [9]:
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['clean_text'], "&amp;")
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['clean_text'], "#[\w]*") #removes all hashtags
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['clean_text'], "https?:\/\/.*[\r\n]*")
tweets

error: unbalanced parenthesis at position 23

In [None]:
def clean_text(text):
    # reduce multiple spaces and newlines to only one
    text = re.sub(r'(\s\s+|\n\n+)', r'\1', text)
    # remove double quotes
    text = re.sub(r'"', '', text)

    return text

In [None]:
tweets['clean_text'] = tweets['text'].apply(clean_text)
tweets

In [None]:
nlp = spacy.load('en')

def convert_text(text):
    sent = nlp(text)
    ents = {x.text: x for x in sent.ents}
    tokens = []
    for w in sent:
        if w.is_stop or w.is_punct:
            continue
        if w.text in ents:
            tokens.append(w.text)
        else:
            tokens.append(w.lemma_.lower())
    text = ' '.join(tokens)

    return text

In [None]:
tweets['clean_text'] = tweets['clean_text'].apply(convert_text)
tweets

### Remove things

In [None]:
tweets['clean_text'] = tweets['text']
tweets

In [None]:
# Clean text by removing things
def remove_pattern(text,pattern):
    
    # re.findall() finds the pattern i.e @user and puts it in a list for further task
    r = re.findall(pattern,text)
    
    # re.sub() removes @user from the sentences in the dataset
    for i in r:
        text = re.sub(i,"",text)
    
    return text

#Remove @ symbol, URL links, and "&amp;"
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['text'], "@[\w]*") #removes all @
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['clean_text'], "&amp;")
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['clean_text'], "#[\w]*") #removes all hashtags
tweets['clean_text'] = np.vectorize(remove_pattern)(tweets['clean_text'], "https:\/\/.*[\r\n]*")
tweets

In [None]:
testset = pd.read_csv('../Data/twitter-test.csv')
testset

In [None]:
# Do the same for the testing dataset
# Clean text by removing things
testset['clean_text'] = np.vectorize(remove_pattern)(testset['text'], "@[\w]*") #removes all @
testset['clean_text'] = np.vectorize(remove_pattern)(testset['clean_text'], "https?:\/\/.*[\r\n]*")
testset['clean_text'] = np.vectorize(remove_pattern)(testset['clean_text'], "&amp;")
testset['clean_text'] = np.vectorize(remove_pattern)(testset['clean_text'], "#[\w]*") #removes all hashtags
testset

### Remove punctuations

In [None]:
# Remove punctuation, numbers, and special characters
tweets['clean_text'] = tweets['clean_text'].str.replace("[^a-zA-Z#]", " ")
tweets

In [None]:
# Do the same for the testing dataset
# Remove punctuation, numbers, and special characters
testset['clean_text'] = testset['clean_text'].str.replace("[^a-zA-Z#]", " ")
testset

### Remove stop words

In [None]:
# Remove short words less than 3
tweets['clean_text'] = tweets['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
tweets.head(10)

In [None]:
# Do the same for the testing dataset
# Remove short words less than 2
testset['clean_text'] = testset['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
testset.head(10)

### Create new column to count length of clean text

In [None]:
# Count length of characters
tweets['length'] = tweets['clean_text'].apply(len)
tweets

In [None]:
# Do the same for the testing dataset
# Count length
testset['length'] = testset['clean_text'].apply(len)
testset

### Remove rows in training data that have less than desired text length

In [None]:
plt.hist(tweets['length'])
plt.show()

In [None]:
tweets['length'].describe()

In [None]:
# Remove rows where length <= 30
tweets = tweets[tweets.length > 30]
tweets

In [None]:
plt.hist(tweets['length'])
plt.show()

### Tokenize, stem, and stich back

In [None]:
nlp = spacy.load('en')

In [None]:
def clean_text(text):
    # reduce multiple spaces and newlines to only one
    text = re.sub(r'(\s\s+|\n\n+)', r'\1', text)
    # remove double quotes
    text = re.sub(r'"', '', text)

    return text

In [None]:
tweets['clean_text2'] = tweets['text'].apply(clean_text)
tweets

In [None]:
def convert_text(text):
    sent = nlp(text)
    ents = {x.text: x for x in sent.ents}
    tokens = []
    for w in sent:
        if w.is_stop or w.is_punct:
            continue
        if w.text in ents:
            tokens.append(w.text)
        else:
            tokens.append(w.lemma_.lower())
    text = ' '.join(tokens)

    return text

In [None]:
tweets['clean_text2'] = tweets['text'].apply(convert_text)

In [None]:
tweets

In [None]:
# Google 'pandas' .apply()


In [None]:
# Define function to capitalize all characters
def capitalize(x):
    return x.upper()

In [None]:
test = 'allen'
capitalize(test)
# do this for df column

In [None]:
# lambda is a one use function that you don't need to define

In [None]:
# # Tokenization
# tokenized_tweet = tweets['clean_text'].apply(lambda x: x.split())
# tokenized_tweet.head()

In [None]:
# # Do the same for the testing dataset
# # Tokenization
# tokenized_testset = testset['clean_text'].apply(lambda x: x.split())
# tokenized_testset.head()

In [None]:
# # Stemming
# ps = PorterStemmer()
# tokenized_tweet = tokenized_tweet.apply(lambda x: [ps.stem(i) for i in x])
# tokenized_tweet.head()

In [None]:
# # Do the same for the testing dataset
# # Stemming
# ps = PorterStemmer()
# tokenized_testset = tokenized_testset.apply(lambda x: [ps.stem(i) for i in x])
# tokenized_testset.head()

In [None]:
# # Stich tokens back together
# for i in range(len(tokenized_tweet)):
#     tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
          
# testset['clean_text'] = tokenized_tweet
# testset['clean_text']

In [None]:
# # Stich tokens back together
# for i in range(len(tokenized_tweet)):
#     tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
          
# testset['clean_text'] = tokenized_tweet
# testset['clean_text']

# 4. Model Selectioin and Machine Learning

### Bag of Words

In [None]:
# Bag-of-Words features
bow_vectorizer = CountVectorizer(stop_words='english')

# Bag-of-Words feature matrix
bow = bow_vectorizer.fit_transform(tweets['clean_text'])
df_bow = pd.DataFrame(bow.todense(), columns=bow_vectorizer.get_feature_names())
df_bow

In [None]:
# Do the same for test dataset
# Bag-of-Words feature matrix
bow = bow_vectorizer.transform(testset['clean_text'])
df_bow_test = pd.DataFrame(bow.todense(), columns=bow_vectorizer.get_feature_names())
df_bow_test

### Use Bag of Words to Build Model

In [None]:
# Splitting the data into training and validation set
X = df_bow
y = tweets['gender']

# Use Bag-of-Words Features
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X, y, test_size=0.2)

### Use Logistic Regression

In [None]:
# Fitting on Logistic Regression model
logreg = LogisticRegression()
logreg.fit(X_train_bow, y_train_bow)

In [None]:
# The first part of the list is predicting probabilities for gender:0 (male)
# The second part of the list is predicting probabilities for gender:1 (female)
prediction_bow = logreg.predict_proba(X_test_bow)
prediction_bow

In [None]:
# Calculating the F1 score
# If prediction is greater than or equal to 0.3 than 1, else 0
# Where 0 is for male tweets and 1 is for female tweets
prediction_int = prediction_bow[:,1]>=0.5

prediction_int = prediction_int.astype(np.int)
prediction_int

# Calculating f1 score
log_bow = f1_score(y_test_bow, prediction_int)

log_bow

### Predict with separate test dataset

In [None]:
# Check if there is a fit model
logreg.intercept_, logreg.coef_

In [None]:
z = df_bow_test
pred = logreg.predict_proba(z)
pred

In [None]:
pred2 = logreg.predict(z)
pred2

In [None]:
df = pd.DataFrame(data=pred)
df

In [None]:
pred2 = pd.DataFrame(data=pred2, columns=['predicted_gender'])
pred2

In [None]:
testset.join(pred2)

### Use TF-IDF

In [None]:
# TF-IDF features (Term Frequency-Inverse Document Frequency)
tfidf=TfidfVectorizer(stop_words='english')
tfidf_matrix=tfidf.fit_transform(tweets['clean_text'])
df_tfidf = pd.DataFrame(tfidf_matrix.todense(), columns=tfidf.get_feature_names())
df_tfidf

In [None]:
# Do the same for the test dataset
# TF-IDF features (Term Frequency-Inverse Document Frequency)
tfidf_matrix=tfidf.transform(testset['clean_text'])
df_tfidf_test = pd.DataFrame(tfidf_matrix.todense(), columns=tfidf.get_feature_names())
df_tfidf_test

### Use TF-IDF to Build Model

In [None]:
# Splitting the data into training and validation set
X = df_tfidf
y = tweets['gender']

# Use Bag-of-Words Features
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X, y, test_size=0.2)

In [None]:
# Using TF-IDF Features
logreg.fit(X_train_tfidf, y_train_tfidf)

In [None]:
prediction_tfidf = logreg.predict_proba(X_test_tfidf)
prediction_tfidf

In [None]:
# Calculating the F1 score
prediction_int = prediction_tfidf[:,1]>=0.5
prediction_int = prediction_int.astype(np.int)
prediction_int

# calculating f1 score
log_tfidf = f1_score(y_test_tfidf, prediction_int)
log_tfidf

### Use Decision Tree

In [None]:
# Decision Tree
dtc = DecisionTreeClassifier(criterion='entropy', random_state=1)

In [None]:
# Using Bag of Words as features
dtc.fit(X_train_bow, y_train_bow)
dtc_bow = dtc.predict_proba(X_test_bow)
dtc_bow

In [None]:
# if prediction is greater than or equal to 0.3 than 1 else 0
# Where 0 is for positive sentiment tweets and 1 for negative sentiment tweets
dtc_bow = dtc_bow[:,1]>=0.5

# converting the results to integer type
dtc_int_bow=dtc_bow.astype(np.int)

# calculating f1 score
dtc_score_bow=f1_score(y_test_bow, dtc_int_bow)

dtc_score_bow

In [None]:
# Using TF-IDF
dtc.fit(x_train_tfidf,y_train_tfidf)

In [None]:
dtc_tfidf = dtc.predict_proba(X_test_tfidf)

dtc_tfidf

In [None]:
# if prediction is greater than or equal to 0.3 than 1 else 0
# Where 0 is for positive sentiment tweets and 1 for negative sentiment tweets
dtc_tfidf=dtc_tfidf[:,1]>=0.3

# converting the results to integer type
dtc_int_tfidf=dtc_tfidf.astype(np.int)

# calculating f1 score
dtc_score_tfidf=f1_score(y_test_tfidf,dtc_int_tfidf)

dtc_score_tfidf

In [None]:
# Model Comparison
Algo=['LogisticRegression(Bag-of-Words)','DecisionTree(Bag-of-Words)','LogisticRegression(TF-IDF)','DecisionTree(TF-IDF)']


In [None]:
score = [log_bow,dct_score_bow,log_tfidf,dct_score_tfidf]

compare=pd.DataFrame({'Model':Algo,'F1_Score':score},index=[i for i in range(1,5)])
compare.T


In [None]:
plt.figure(figsize=(18,5))

sns.pointplot(x='Model',y='F1_Score',data=compare)

plt.title('Model Vs Score')
plt.xlabel('MODEL')
plt.ylabel('SCORE')

plt.show()

# Test With Real Text

In [None]:
# Check if there is a fit model

In [None]:
Log_Reg.intercept_, Log_Reg.coef_

In [None]:
test_text = pd.read_csv('../Data/tweetstest.csv')
test_text

In [None]:
bow = bow_vectorizer.transform(test_text['clean_text']) #use .transform() not .fit_transform()
df_bow = pd.DataFrame(bow.todense())
df_bow

In [None]:
prediction_bow = Log_Reg.predict_proba(X)
prediction_bow

In [None]:
test_text = "this is a test tweet to predict my gender baby boo"

# Bag-of-Words feature matrix
bow = bow_vectorizer.fit_transform('test_text')
df_bow = pd.DataFrame(bow.todense())
df_bow

In [None]:
text = "I am so angry"
textBlob = TextBlob(text)
print(f"{textBlob.sentiment}")

# ###### 

#### 