# Understanding the Role of Gender in Book Reviews

## CS 6471: Computational Social Science - Project
Author: Pratyusha Maiti

Affiliation: Georgia Institute of Technology

### Research Objectives
- Do book reviews by different genders differ in stylistic features?
- Do book reviews by different genders differ w.r.t their content?
- Are book reviews written by males perceived as more useful than book reviews written by females?

In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
!pip install contractions

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pratyushamaiti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pratyushamaiti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pratyushamaiti/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/pratyushamaiti/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/pratyushamaiti/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


You should consider upgrading via the '/opt/homebrew/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


### Review Data Cleaning

Parameters:
 - Expand contractions
 - Remove stopwords
 - Remove junk and html
 - Tokenize
 - Lemmatize
 

In [2]:
import contractions 
import re
from nltk.corpus import stopwords
import numpy as np

In [22]:
expand_contractions = True
remove_stopwords = False
format_text = True
tokenize = True
lemmatize = False

def clean_text(text):
    '''Text Preprocessing '''
    
    # Convert words to lower case
    text = text.lower()
    split_text = text.split()
    
    # Expand contractions
    if expand_contractions:
        expanded_words = []   
        for word in split_text:
            expanded_words.append(contractions.fix(word))
        text = " ".join(expanded_words)
#         print("Text after contractions removal: ", text)

    # Format words and remove unwanted characters
    if format_text:
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        text = re.sub(r'\<a href', ' ', text)
        text = re.sub(r'&amp;', '', text) 
        text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'\'', ' ', text)
    
    # remove stopwords
    if remove_stopwords:
        split_text = text.split()
        stops = set(stopwords.words("english"))
        word = [w for w in split_text if not w in stops]
        text = " ".join(word)
#         print("Text after stopwords removal: ", text)

    # Tokenize each word
    if tokenize:
        text =  nltk.WordPunctTokenizer().tokenize(text)
    
    # Lemmatize each token
    if lemmatize:
        lemm = nltk.stem.WordNetLemmatizer()
        text = list(map(lambda word:list(map(lemm.lemmatize, word)), text))
    
    return text

##### Generate gendered review data

- Genre: Crime, Thriller, Mystery
- Joined by userID
- Dropped NaN rows

In [4]:
import pandas as pd
gendered_data = pd.read_csv('goodreads_names_id_gender.csv')
gendered_data.columns = ['id', 'name', 'username', 'age', 'location', 'joined', 'friends_count','groups_count', 'reviews_count', 'first_name', 'gender', 'user_id']
gendered_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 852197 entries, 0 to 852196
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             852197 non-null  int64  
 1   name           850616 non-null  object 
 2   username       252735 non-null  object 
 3   age            242043 non-null  float64
 4   location       665382 non-null  object 
 5   joined         820599 non-null  object 
 6   friends_count  820599 non-null  float64
 7   groups_count   820599 non-null  float64
 8   reviews_count  820599 non-null  float64
 9   first_name     850555 non-null  object 
 10  gender         852197 non-null  object 
 11  user_id        852197 non-null  object 
dtypes: float64(4), int64(1), object(7)
memory usage: 78.0+ MB


In [5]:
import gzip
import json

def load_data(file_name, head = 500):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)
            
            # break if reaches the 100th line
            if (head is not None) and (count > head):
                break
    return data

In [6]:
goodreads_reviews_mystery_thriller_crime = load_data('goodreads_reviews_mystery_thriller_crime.json.gz',1849235)
goodreads_reviews_mystery_thriller_crime_df = pd.DataFrame(goodreads_reviews_mystery_thriller_crime)
goodreads_reviews_mystery_thriller_crime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1849236 entries, 0 to 1849235
Data columns (total 11 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   user_id       object
 1   book_id       object
 2   review_id     object
 3   rating        int64 
 4   review_text   object
 5   date_added    object
 6   date_updated  object
 7   read_at       object
 8   started_at    object
 9   n_votes       int64 
 10  n_comments    int64 
dtypes: int64(3), object(8)
memory usage: 155.2+ MB


In [10]:
gendered_reviews_mtc = pd.merge(goodreads_reviews_mystery_thriller_crime_df, gendered_data, on='user_id', how='left')
gendered_reviews_mtc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1849236 entries, 0 to 1849235
Data columns (total 22 columns):
 #   Column         Dtype  
---  ------         -----  
 0   user_id        object 
 1   book_id        object 
 2   review_id      object 
 3   rating         int64  
 4   review_text    object 
 5   date_added     object 
 6   date_updated   object 
 7   read_at        object 
 8   started_at     object 
 9   n_votes        int64  
 10  n_comments     int64  
 11  id             float64
 12  name           object 
 13  username       object 
 14  age            float64
 15  location       object 
 16  joined         object 
 17  friends_count  float64
 18  groups_count   float64
 19  reviews_count  float64
 20  first_name     object 
 21  gender         object 
dtypes: float64(5), int64(3), object(14)
memory usage: 324.5+ MB


Incomplete dataset. Dropped the rows with NaN values in features. We find that this brings out the number of reviews by ~1.5 million reviews

In [11]:
clean_gendered_reviews_mtc = gendered_reviews_mtc.dropna()
clean_gendered_reviews_mtc = clean_gendered_reviews_mtc.reset_index(drop=True)
clean_gendered_reviews_mtc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344428 entries, 0 to 344427
Data columns (total 22 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   user_id        344428 non-null  object 
 1   book_id        344428 non-null  object 
 2   review_id      344428 non-null  object 
 3   rating         344428 non-null  int64  
 4   review_text    344428 non-null  object 
 5   date_added     344428 non-null  object 
 6   date_updated   344428 non-null  object 
 7   read_at        344428 non-null  object 
 8   started_at     344428 non-null  object 
 9   n_votes        344428 non-null  int64  
 10  n_comments     344428 non-null  int64  
 11  id             344428 non-null  float64
 12  name           344428 non-null  object 
 13  username       344428 non-null  object 
 14  age            344428 non-null  float64
 15  location       344428 non-null  object 
 16  joined         344428 non-null  object 
 17  friends_count  344428 non-nul

In [16]:
clean_gendered_reviews_mtc['gender'].value_counts()

female    177160
male       72531
Name: gender, dtype: int64

Dropped the rows with *unknown gender type*. We find that ratio of female to male reviewers in the test dataset is ~ 2.5:1

In [15]:
clean_gendered_reviews_mtc = clean_gendered_reviews_mtc[clean_gendered_reviews_mtc.gender != 'unknown']
clean_gendered_reviews_mtc = clean_gendered_reviews_mtc.reset_index(drop=True)
clean_gendered_reviews_mtc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249691 entries, 0 to 249690
Data columns (total 22 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   user_id        249691 non-null  object 
 1   book_id        249691 non-null  object 
 2   review_id      249691 non-null  object 
 3   rating         249691 non-null  int64  
 4   review_text    249691 non-null  object 
 5   date_added     249691 non-null  object 
 6   date_updated   249691 non-null  object 
 7   read_at        249691 non-null  object 
 8   started_at     249691 non-null  object 
 9   n_votes        249691 non-null  int64  
 10  n_comments     249691 non-null  int64  
 11  id             249691 non-null  float64
 12  name           249691 non-null  object 
 13  username       249691 non-null  object 
 14  age            249691 non-null  float64
 15  location       249691 non-null  object 
 16  joined         249691 non-null  object 
 17  friends_count  249691 non-nul

In [17]:
gendered_reviews_mtc['review_text'][323]

"Fun to head back to #1 after reading 3 of the more recent (one more new one yet to read). And interesting fact: I'd always assumed references to prior cases were references to prior books. However, here in the first book are references to prior cases. Either it is just part of the needed context or there will be a new novel set before this first one. Time will tell. \n Also amusing. Here in the first one, cell phones worked in Three pines. In the later ones they don't. \n And as far as I can tell, an inconsistency within this first one: an early scene takes place in the bistro and it is about a delay; later when that delay is explained, it also happened at the bistro, so would have been impossible for the waiting person to have missed it. Small detail. \n One thing I enjoy about Penny'e mysteries is the characterization, and it is present in this early book. And she does well at red herring clues among the real clues."

### Feature Engineering

- Generate clean review data on the reduced dataset
- Generate writing style features
- Generate content features


In [23]:
clean_text(clean_gendered_reviews_mtc['review_text'][3])

['an',
 'amazing',
 'and',
 'unique',
 'creation',
 'jj',
 'abrams',
 'and',
 'doug',
 'dorst',
 'created',
 'what',
 'reads',
 'like',
 'a',
 'classic',
 'work',
 'of',
 'fiction',
 'something',
 'you',
 'can',
 'easily',
 'imagine',
 'having',
 'read',
 'in',
 'english',
 'class',
 'and',
 'then',
 'wrote',
 'a',
 'intriguing',
 'side',
 'story',
 'in',
 'the',
 'margins',
 'a',
 'grad',
 'student',
 'eric',
 'has',
 'left',
 'his',
 'annotated',
 'copy',
 'of',
 'sot',
 'ship',
 'of',
 'theseus',
 'in',
 'the',
 'library',
 'and',
 'an',
 'undergrad',
 'jen',
 'finds',
 'it',
 'and',
 'replies',
 'to',
 'his',
 'annotations',
 'this',
 'leads',
 'to',
 'them',
 'making',
 'exciting',
 'discoveries',
 'about',
 'the',
 'book',
 'and',
 'also',
 'falling',
 'in',
 'love',
 'the',
 'first',
 'thing',
 'is',
 'this',
 'book',
 'is',
 'just',
 'beautifully',
 'printed',
 'it',
 'looks',
 'and',
 'feels',
 'like',
 'a',
 'classic',
 'book',
 'to',
 'the',
 'point',
 'where',
 'people',
 '

In [21]:
!pip install tqdm

You should consider upgrading via the '/opt/homebrew/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [68]:
gendered_reviews_mtc['review_text'][0] = ' '.join(clean_text(gendered_reviews_mtc['review_text'][0]))

In [24]:
from tqdm import tqdm

pd.options.mode.chained_assignment = None
for idx in tqdm(range(clean_gendered_reviews_mtc.shape[0])):
    clean_gendered_reviews_mtc['review_text'][idx] = ' '.join(clean_text(clean_gendered_reviews_mtc['review_text'][idx]))
    

100%|██████████| 249691/249691 [15:16<00:00, 272.55it/s]


In [30]:
clean_gendered_reviews_mtc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249691 entries, 0 to 249690
Data columns (total 22 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   user_id        249691 non-null  object 
 1   book_id        249691 non-null  object 
 2   review_id      249691 non-null  object 
 3   rating         249691 non-null  int64  
 4   review_text    249691 non-null  object 
 5   date_added     249691 non-null  object 
 6   date_updated   249691 non-null  object 
 7   read_at        249691 non-null  object 
 8   started_at     249691 non-null  object 
 9   n_votes        249691 non-null  int64  
 10  n_comments     249691 non-null  int64  
 11  id             249691 non-null  float64
 12  name           249691 non-null  object 
 13  username       249691 non-null  object 
 14  age            249691 non-null  float64
 15  location       249691 non-null  object 
 16  joined         249691 non-null  object 
 17  friends_count  249691 non-nul

In [31]:
# clean_gendered_reviews_mtc = clean_gendered_reviews_mtc[len(clean_gendered_reviews_mtc.review_text)>0]
# clean_gendered_reviews_mtc = clean_gendered_reviews_mtc.reset_index(drop=True)
# clean_gendered_reviews_mtc.info()

In [32]:
file_name = "cleaned_gendered_reviews_mtc"
clean_gendered_reviews_mtc.to_csv(file_name, sep='\t', encoding='utf-8')

### Writing Style Analysis

In [33]:
# Features under analysis:
lexical_markers = []
vocabulary_richness = []
complexity = []
hedging = []
use_of_pronouns = []

In [34]:
## small set of reviews taken into consideration

small_gendered_reviews_mtc = clean_gendered_reviews_mtc

# small_gendered_reviews_mtc = clean_gendered_reviews_mtc[:40000]
# small_gendered_reviews_mtc.info()

In [112]:
topN_Freq(small_gendered_reviews_mtc['review_text'][44].split(), 5)

[('the', 16), ('and', 14), ('is', 11), ('a', 11), ('to', 10)]

#### Get Top 50 common words in the entire reviewset

In [35]:
from collections import Counter
from nltk.probability import FreqDist

def topN_Freq(data, n: int):
    fdist = FreqDist(data)
    return fdist.most_common(n)

top_50 = {}
dic2 = {}
for idx in tqdm(range(small_gendered_reviews_mtc.shape[0])):
    
    freqDict = topN_Freq(small_gendered_reviews_mtc['review_text'][idx].split(), 50)
    dic1 = dict(freqDict)
    if idx != 0:
        result = {k: dic1.get(k, 0) + dic2.get(k, 0) for k in set(dic1) | set(dic2)}
    dic2 = dict(Counter(top_50).most_common(50))

top_50 = dict(Counter(top_50).most_common(50))
len(top_50)
    

100%|██████████| 249691/249691 [00:21<00:00, 11625.64it/s]


0

#### Get all occurances of the top 50 frequent words in each review

In [36]:
lexical_markers = []
for idx in tqdm(range(small_gendered_reviews_mtc.shape[0])):
    freq = Counter(small_gendered_reviews_mtc['review_text'][idx].split())
    lexical_markers.append(sum(freq.get(k, 0) for k in set(result)))
len(lexical_markers)

100%|██████████| 249691/249691 [00:06<00:00, 40114.12it/s]


249691

In [37]:
from nltk.tokenize import word_tokenize, sent_tokenize

def avgSentLenghtByWord(text):
    tokens = sent_tokenize(text)
    return np.average([len(token.split()) for token in tokens])

In [38]:
def countFunctionalWords(text):
    functional_words = """a between in nor some upon
    about both including nothing somebody us
    above but inside of someone used
    after by into off something via
    all can is on such we
    although cos it once than what
    am do its one that whatever
    among down latter onto the when
    an each less opposite their where
    and either like or them whether
    another enough little our these which
    any every lots outside they while
    anybody everybody many over this who
    anyone everyone me own those whoever
    anything everything more past though whom
    are few most per through whose
    around following much plenty till will
    as for must plus to with
    at from my regarding toward within
    be have near same towards without
    because he need several under worth
    before her neither she unless would
    behind him no should unlike yes
    below i nobody since until you
    beside if none so up your
    """

    functional_words = functional_words.split()
    count = 0

    for i in text:
        if i in functional_words:
            count += 1

    return count / len(text)

In [39]:
# hedgeWords = pd.read_csv("hedgeWords.txt", sep="\n")
hedgefile = open("hedgeWords.txt", "r")
content = hedgefile.read()
hedgeWords = content.split("\n")
hedgefile.close()

hedging = []
for idx in tqdm(range(small_gendered_reviews_mtc.shape[0])):
    hedging.append(sum(x == y for x, y in zip(small_gendered_reviews_mtc['review_text'], hedgeWords)))
len(hedging[])

100%|██████████| 249691/249691 [00:05<00:00, 49124.41it/s]


[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [40]:
import math 

def yulesCharacteristicK(words):
    N = len(words)
    K=0
    if N>0:
        freqs = Counter()
        freqs.update(words)
        vi = Counter()
        vi.update(freqs.values())
        M = sum([(value * value) * vi[value] for key, value in freqs.items()])
        K = 10000 * (M - N) / math.pow(N, 2)
    return K
vocabulary_richness = []
for idx in tqdm(range(small_gendered_reviews_mtc.shape[0])):
    vocabulary_richness.append(yulesCharacteristicK(small_gendered_reviews_mtc['review_text'][idx]))
len(vocabulary_richness)

100%|██████████| 249691/249691 [00:09<00:00, 27683.48it/s]


249691

In [41]:
from nltk.tokenize import sent_tokenize

def avg_wordLength(text):
    return np.average([len(word) for word in text.split()])

def avg_SentLenghtByCh(text):
    tokens = sent_tokenize(text)
    return np.average([len(token) for token in tokens])

def avg_SentLenghtByWord(text):
    tokens = sent_tokenize(text)
    return np.average([len(token.split()) for token in tokens])

complexity_1 = []
complexity_2 = []
complexity_3 = []
for idx in tqdm(range(small_gendered_reviews_mtc.shape[0])):
    if len(small_gendered_reviews_mtc['review_text'][idx])>0:
        complexity_1.append(avg_wordLength(small_gendered_reviews_mtc['review_text'][idx]))
        complexity_2.append(avg_SentLenghtByCh(small_gendered_reviews_mtc['review_text'][idx]))
        complexity_3.append(avg_SentLenghtByWord(small_gendered_reviews_mtc['review_text'][idx]))
#         complexity.append([avg_wordLength(small_gendered_reviews_mtc['review_text'][idx]), avg_SentLenghtByCh(small_gendered_reviews_mtc['review_text'][idx]), avg_SentLenghtByWord(small_gendered_reviews_mtc['review_text'][idx])])
    else:
        complexity_1.append(0)
        complexity_2.append(0)
        complexity_3.append(0)
len(complexity_1), len(complexity_2), len(complexity_3)



100%|██████████| 249691/249691 [00:53<00:00, 4687.68it/s]


(249691, 249691, 249691)

In [43]:
# from nltk import word_tokenize, pos_tag

# pronouns = []
# for idx in tqdm(range(small_gendered_reviews_mtc.shape[0])):
#     if len(small_gendered_reviews_mtc['review_text'][idx])>0:
#         pronouns.append(sum(1 for word, pos in pos_tag(word_tokenize(small_gendered_reviews_mtc['review_text'][idx]), tagset='universal') if pos =='PRON')/len(word_tokenize(small_gendered_reviews_mtc['review_text'][idx])))
#     else:
#         pronouns.append(0)
    
# len(pronouns)


In [46]:
from nltk import word_tokenize, pos_tag

pos_pronouns = []
for idx in tqdm(range(small_gendered_reviews_mtc.shape[0])):
    if len(small_gendered_reviews_mtc['review_text'][idx])>0:
        pos_pronouns.append(sum(1 for word, pos in pos_tag(word_tokenize(small_gendered_reviews_mtc['review_text'][idx]), tagset='universal') if pos.startswith('PR'))/len(word_tokenize(small_gendered_reviews_mtc['review_text'][idx])))
    else:
        pos_pronouns.append(0)
    
len(pos_pronouns)

100%|██████████| 249691/249691 [22:50<00:00, 182.21it/s] 


249691

### Testing out Logistic Regression 

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [48]:
df = pd.DataFrame()

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249691 entries, 0 to 249690
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   frequency            249691 non-null  int64  
 1   vocabulary_richness  249691 non-null  float64
 2   pronouns             249691 non-null  float64
 3   complexity1          249691 non-null  float64
 4   complexity2          249691 non-null  float64
 5   complexity3          249691 non-null  float64
dtypes: float64(5), int64(1)
memory usage: 11.4 MB


In [50]:
df['frequency'] = lexical_markers

In [52]:
# df['hedging'] = hedging

In [53]:
df['vocabulary_richness'] = vocabulary_richness

In [56]:
df['complexity1'] = complexity_1
df['complexity2'] = complexity_2
df['complexity3'] = complexity_3

In [54]:
df['pronouns'] = pos_pronouns

In [61]:
training_data, testing_data = train_test_split(small_gendered_reviews_mtc,random_state = 0)

In [62]:
training_data.info(), testing_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187268 entries, 198069 to 199340
Data columns (total 22 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   user_id        187268 non-null  object 
 1   book_id        187268 non-null  object 
 2   review_id      187268 non-null  object 
 3   rating         187268 non-null  int64  
 4   review_text    187268 non-null  object 
 5   date_added     187268 non-null  object 
 6   date_updated   187268 non-null  object 
 7   read_at        187268 non-null  object 
 8   started_at     187268 non-null  object 
 9   n_votes        187268 non-null  int64  
 10  n_comments     187268 non-null  int64  
 11  id             187268 non-null  float64
 12  name           187268 non-null  object 
 13  username       187268 non-null  object 
 14  age            187268 non-null  float64
 15  location       187268 non-null  object 
 16  joined         187268 non-null  object 
 17  friends_count  187268 no

(None, None)

In [60]:
Y_train=training_data['gender'].values
Y_test=testing_data['gender'].values

In [178]:
def extract_features(df,field, training_data,testing_data):
    """Extract features using different methods""" 
        
    # TF-IDF BASED FEATURE REPRESENTATION
    tfidf_vectorizer=TfidfVectorizer(use_idf=True, max_df=0.95)
    tfidf_vectorizer.fit_transform(training_data[field].values)

    train_feature_set=tfidf_vectorizer.transform(training_data[field].values)
    test_feature_set=tfidf_vectorizer.transform(testing_data[field].values)

    return train_feature_set,test_feature_set,tfidf_vectorizer


In [179]:
X_train,X_test,feature_transformer=extract_features(small_gendered_reviews_mtc, 'review_text', training_data,testing_data)



In [63]:
Y_train=small_gendered_reviews_mtc[:200000]['gender'].values
Y_test=small_gendered_reviews_mtc[200000:]['gender'].values

In [64]:
scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=10000)
model=scikit_log_reg.fit(df[:200000],Y_train)

[LibLinear]

In [65]:
predicted = model.predict(df[200000:])
predicted

array(['female', 'female', 'female', ..., 'female', 'female', 'female'],
      dtype=object)

In [74]:
probs = model.predict_proba(df[200000:])
probs

array([[0.72681076, 0.27318924],
       [0.77564161, 0.22435839],
       [0.71933223, 0.28066777],
       ...,
       [0.70166609, 0.29833391],
       [0.70795074, 0.29204926],
       [0.7862406 , 0.2137594 ]])

In [76]:
from sklearn.metrics import log_loss,accuracy_score

print(("Accuracy  ") + str(accuracy_score(Y_test, predicted)))

Accuracy  0.6819947274154273


In [86]:
Y_test_n = [0 if y == "male" else 1 for y in Y_test]
probs_n = [i[0] for i in probs]

In [87]:
import sklearn.metrics as metrics

def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))
    
regression_results(Y_test_n, probs_n)

explained_variance:  -0.0045
mean_squared_log_error:  0.108
r2:  -0.0083
MAE:  0.4197
MSE:  0.2175
RMSE:  0.4664


In [None]:
def get_top_k_predictions(model,X_test,k):
    
    # get probabilities instead of predicted labels, since we want to collect top 3
    probs = model.predict_proba(X_test)
    # GET TOP K PREDICTIONS BY PROB - note these are just index
    best_n = np.argsort(probs, axis=1)[:,-k:]

    # GET CATEGORY OF PREDICTIONS
    preds=[[model.classes_[predicted_cat] for predicted_cat in prediction] for prediction in best_n]

    # REVERSE CATEGORIES - DESCENDING ORDER OF IMPORTANCE
    preds=[ item[::-1] for item in preds]

    return preds

probs = model.predict_proba(X_test)

### Content Features

Perplexity, OOV words, Entropy and other features. Will be explained later :)

In [88]:
import scipy as sc

def ShannonEntropy(text):
    freqs = Counter()
    freqs.update(words)
    arr = np.array(list(freqs.values()))
    distribution = 1. * arr
    distribution /= max(1, len(words))
    
    H = sc.stats.entropy(distribution, base=2)
    return H


In [89]:
def SimpsonsIndex(text):
    freqs = Counter()
    freqs.update(words)
    N = len(words)
    n = sum([1.0 * i * (i - 1) for i in freqs.values()])
    D = 1 - (n / (N * (N - 1)))
    return D