# Understanding the Role of Gender in Book Reviews

## CS 6471: Computational Social Science - Project
Author: Pratyusha Maiti

Affiliation: Georgia Institute of Technology

### Research Objectives
- Do book reviews by different genders differ in stylistic features?
- Do book reviews by different genders differ w.r.t their content?
- Are book reviews written by males perceived as more useful than book reviews written by females?

In [322]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
!pip install contractions

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pratyushamaiti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pratyushamaiti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pratyushamaiti/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/pratyushamaiti/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/pratyushamaiti/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


You should consider upgrading via the '/opt/homebrew/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [185]:
import contractions 
import re
from nltk.corpus import stopwords
import numpy as np

In [311]:
expand_contractions = True
remove_stopwords = False
format_text = True
tokenize = True
lemmatize = False

def clean_text(text):
    '''Text Preprocessing '''
    
    # Convert words to lower case
    text = text.lower()
    split_text = text.split()
    
    # Expand contractions
    if expand_contractions:
        expanded_words = []   
        for word in split_text:
            expanded_words.append(contractions.fix(word))
        text = " ".join(expanded_words)
        print("Text after contractions removal: ", text)

    # Format words and remove unwanted characters
    if format_text:
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        text = re.sub(r'\<a href', ' ', text)
        text = re.sub(r'&amp;', '', text) 
        text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'\'', ' ', text)
    
    # remove stopwords
    if remove_stopwords:
        split_text = text.split()
        stops = set(stopwords.words("english"))
        word = [w for w in split_text if not w in stops]
        text = " ".join(word)
        print("Text after stopwords removal: ", text)

    # Tokenize each word
    if tokenize:
        text =  nltk.WordPunctTokenizer().tokenize(text)
    
    # Lemmatize each token
    if lemmatize:
        lemm = nltk.stem.WordNetLemmatizer()
        text = list(map(lambda word:list(map(lemm.lemmatize, word)), text))
    
    return text

In [205]:
import pandas as pd
gendered_data = pd.read_csv('goodreads_names_id_gender.csv')
gendered_data.columns = ['id', 'name', 'username', 'age', 'location', 'joined', 'friends_count','groups_count', 'reviews_count', 'first_name', 'gender', 'user_id']
gendered_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 852197 entries, 0 to 852196
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             852197 non-null  int64  
 1   name           850616 non-null  object 
 2   username       252735 non-null  object 
 3   age            242043 non-null  float64
 4   location       665382 non-null  object 
 5   joined         820599 non-null  object 
 6   friends_count  820599 non-null  float64
 7   groups_count   820599 non-null  float64
 8   reviews_count  820599 non-null  float64
 9   first_name     850555 non-null  object 
 10  gender         852197 non-null  object 
 11  user_id        852197 non-null  object 
dtypes: float64(4), int64(1), object(7)
memory usage: 78.0+ MB


In [206]:
import gzip
import json

def load_data(file_name, head = 500):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)
            
            # break if reaches the 100th line
            if (head is not None) and (count > head):
                break
    return data

In [207]:
goodreads_reviews_mystery_thriller_crime = load_data('goodreads_reviews_mystery_thriller_crime.json.gz',1849235)
goodreads_reviews_mystery_thriller_crime_df = pd.DataFrame(goodreads_reviews_mystery_thriller_crime)
goodreads_reviews_mystery_thriller_crime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1849236 entries, 0 to 1849235
Data columns (total 11 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   user_id       object
 1   book_id       object
 2   review_id     object
 3   rating        int64 
 4   review_text   object
 5   date_added    object
 6   date_updated  object
 7   read_at       object
 8   started_at    object
 9   n_votes       int64 
 10  n_comments    int64 
dtypes: int64(3), object(8)
memory usage: 155.2+ MB


In [235]:
gendered_reviews_mtc = pd.merge(goodreads_reviews_mystery_thriller_crime_df, gendered_data, on='user_id', how='left')

In [236]:
clean_gendered_reviews_mtc = gendered_reviews_mtc.dropna()
# df.dropna(axis=1, inplace=True, how='any')
# df[~df.isin(['NaN', 'NaT']).any(axis=1)]
# gendered_reviews_mtc[~gendered_reviews_mtc.isin(['NaN', 'NaT']).any(axis=1)]
gendered_reviews_mtc.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,...,name,username,age,location,joined,friends_count,groups_count,reviews_count,first_name,gender
0,8842281e1d1347389f2ab93d60773d4d,6392944,5e212a62bced17b4dbe41150e5bb9037,3,I haven't read a fun mystery book in a while a...,Mon Jul 24 02:48:17 -0700 2017,Sun Jul 30 09:28:03 -0700 2017,Tue Jul 25 00:00:00 -0700 2017,Mon Jul 24 00:00:00 -0700 2017,6,...,Otis Chandler,otis,44.0,"San Francisco, CA",08/2006,2032.0,125.0,1312.0,otis,male
1,8842281e1d1347389f2ab93d60773d4d,28684704,2ede853b14dc4583f96cf5d120af636f,3,"A fun, fast paced science fiction thriller. I ...",Tue Nov 15 11:29:22 -0800 2016,Mon Mar 20 23:40:27 -0700 2017,Sat Mar 18 23:22:42 -0700 2017,Fri Mar 17 23:45:40 -0700 2017,22,...,Otis Chandler,otis,44.0,"San Francisco, CA",08/2006,2032.0,125.0,1312.0,otis,male
2,8842281e1d1347389f2ab93d60773d4d,32283133,8e4d61801907e591018bdc3442a9cf2b,0,http://www.telegraph.co.uk/culture/10...,Tue Nov 01 11:09:18 -0700 2016,Tue Nov 01 11:09:44 -0700 2016,,,9,...,Otis Chandler,otis,44.0,"San Francisco, CA",08/2006,2032.0,125.0,1312.0,otis,male
3,8842281e1d1347389f2ab93d60773d4d,17860739,022bb6daffa49adc27f6b20b6ebeb37d,4,An amazing and unique creation: JJ Abrams and ...,Wed Mar 26 13:51:30 -0700 2014,Tue Sep 23 01:44:36 -0700 2014,Sun Sep 21 00:00:00 -0700 2014,Sat Jul 26 00:00:00 -0700 2014,7,...,Otis Chandler,otis,44.0,"San Francisco, CA",08/2006,2032.0,125.0,1312.0,otis,male
4,8842281e1d1347389f2ab93d60773d4d,8694005,0e317947e1fd341f573192111bb2921d,3,The Name of the Rose is a thrilling Dan Brown-...,Wed Sep 08 01:22:27 -0700 2010,Wed Dec 14 12:30:43 -0800 2016,Mon Aug 10 00:00:00 -0700 2015,Mon Jul 20 00:00:00 -0700 2015,17,...,Otis Chandler,otis,44.0,"San Francisco, CA",08/2006,2032.0,125.0,1312.0,otis,male


In [237]:
gendered_reviews_mtc['gender'].value_counts()

female     954765
unknown    551898
male       312975
Name: gender, dtype: int64

In [238]:
gendered_reviews_mtc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1849236 entries, 0 to 1849235
Data columns (total 22 columns):
 #   Column         Dtype  
---  ------         -----  
 0   user_id        object 
 1   book_id        object 
 2   review_id      object 
 3   rating         int64  
 4   review_text    object 
 5   date_added     object 
 6   date_updated   object 
 7   read_at        object 
 8   started_at     object 
 9   n_votes        int64  
 10  n_comments     int64  
 11  id             float64
 12  name           object 
 13  username       object 
 14  age            float64
 15  location       object 
 16  joined         object 
 17  friends_count  float64
 18  groups_count   float64
 19  reviews_count  float64
 20  first_name     object 
 21  gender         object 
dtypes: float64(5), int64(3), object(14)
memory usage: 324.5+ MB


In [240]:
clean_gendered_reviews_mtc = clean_gendered_reviews_mtc.reset_index(drop=True)
clean_gendered_reviews_mtc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344428 entries, 0 to 344427
Data columns (total 22 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   user_id        344428 non-null  object 
 1   book_id        344428 non-null  object 
 2   review_id      344428 non-null  object 
 3   rating         344428 non-null  int64  
 4   review_text    344428 non-null  object 
 5   date_added     344428 non-null  object 
 6   date_updated   344428 non-null  object 
 7   read_at        344428 non-null  object 
 8   started_at     344428 non-null  object 
 9   n_votes        344428 non-null  int64  
 10  n_comments     344428 non-null  int64  
 11  id             344428 non-null  float64
 12  name           344428 non-null  object 
 13  username       344428 non-null  object 
 14  age            344428 non-null  float64
 15  location       344428 non-null  object 
 16  joined         344428 non-null  object 
 17  friends_count  344428 non-nul

In [310]:
# key = [2, 3, 15, 36, 37]
# for k in key:
#     if clean_gendered_reviews_mtc['review_text'][k]:
#         clean_gendered_reviews_mtc['review_text'][k]
gendered_reviews_mtc['review_text'][323]

"Fun to head back to #1 after reading 3 of the more recent (one more new one yet to read). And interesting fact: I'd always assumed references to prior cases were references to prior books. However, here in the first book are references to prior cases. Either it is just part of the needed context or there will be a new novel set before this first one. Time will tell. \n Also amusing. Here in the first one, cell phones worked in Three pines. In the later ones they don't. \n And as far as I can tell, an inconsistency within this first one: an early scene takes place in the bistro and it is about a delay; later when that delay is explained, it also happened at the bistro, so would have been impossible for the waiting person to have missed it. Small detail. \n One thing I enjoy about Penny'e mysteries is the characterization, and it is present in this early book. And she does well at red herring clues among the real clues."

## Review Data Cleaning



In [213]:
import contractions 
import re
from nltk.corpus import stopwords

expand_contractions = True
remove_stopwords = True
format_text = True
tokenize = True
lemmatize = False

def clean_text(text):
    '''Text Preprocessing '''
    
    # Convert words to lower case
    text = text.lower()
    split_text = text.split()
    
    # Expand contractions
    if expand_contractions:
        expanded_words = []   
        for word in split_text:
            expanded_words.append(contractions.fix(word))
        text = " ".join(expanded_words)
#         print("Text after contractions removal: ", text)

    # Format words and remove unwanted characters
    if format_text:
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        text = re.sub(r'\<a href', ' ', text)
        text = re.sub(r'&amp;', '', text) 
        text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'\'', ' ', text)
    
    # remove stopwords
    if remove_stopwords:
        split_text = text.split()
        stops = set(stopwords.words("english"))
        word = [w for w in split_text if not w in stops]
        text = " ".join(word)
#         print("Text after stopwords removal: ", text)

    # Tokenize each word
    if tokenize:
        text =  nltk.WordPunctTokenizer().tokenize(text)
    
    # Lemmatize each token
    if lemmatize:
        lemm = nltk.stem.WordNetLemmatizer()
        text = list(map(lambda word:list(map(lemm.lemmatize, word)), text))
    
    return text

In [202]:
clean_text(gendered_reviews_mtc['review_text'][3])

['amazing',
 'unique',
 'creation',
 'jj',
 'abrams',
 'doug',
 'dorst',
 'created',
 'reads',
 'like',
 'classic',
 'work',
 'fiction',
 'something',
 'easily',
 'imagine',
 'read',
 'english',
 'class',
 'wrote',
 'intriguing',
 'side',
 'story',
 'margins',
 'grad',
 'student',
 'eric',
 'left',
 'annotated',
 'copy',
 'sot',
 'ship',
 'theseus',
 'library',
 'undergrad',
 'jen',
 'finds',
 'replies',
 'annotations',
 'leads',
 'making',
 'exciting',
 'discoveries',
 'book',
 'also',
 'falling',
 'love',
 'first',
 'thing',
 'book',
 'beautifully',
 'printed',
 'looks',
 'feels',
 'like',
 'classic',
 'book',
 'point',
 'people',
 'would',
 'ask',
 'reading',
 'old',
 'book',
 'marginalia',
 'feels',
 'real',
 'imagine',
 'hard',
 'print',
 'one',
 'cool',
 'keep',
 'shelves',
 'complaint',
 'inserts',
 'cool',
 'idea',
 'fall',
 'time',
 'point',
 'idea',
 'pages',
 'supposed',
 'interesting',
 'thing',
 'book',
 'blueprint',
 'people',
 'discuss',
 'books',
 'sometimes',
 'discuss

In [39]:
!pip install tqdm

You should consider upgrading via the '/opt/homebrew/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [68]:
gendered_reviews_mtc['review_text'][0] = ' '.join(clean_text(gendered_reviews_mtc['review_text'][0]))

In [242]:
from tqdm import tqdm

pd.options.mode.chained_assignment = None
for idx in tqdm(range(clean_gendered_reviews_mtc.shape[0])):
    clean_gendered_reviews_mtc['review_text'][idx] = ' '.join(clean_text(clean_gendered_reviews_mtc['review_text'][idx]))

 13%|█▎        | 44589/344428 [11:20<1:16:17, 65.51it/s]  


KeyboardInterrupt: 

In [None]:
clean_gendered_reviews_mtc.head()

In [243]:
file_name = "cleaned_gendered_reviews_mtc"
clean_gendered_reviews_mtc[:40000].to_csv(file_name, sep='\t', encoding='utf-8')

### Writing Style Analysis

In [294]:
# Features under analysis:
X = []
Y = []
lexical_markers = []
vocabulary_richness = []
complexity = []
hedging = []
use_of_pronouns = []

In [102]:
from nltk.probability import FreqDist

def topN_Freq(data, n: int):
    fdist = FreqDist(data)
#     print(fdist.most_common(n))
    return fdist.most_common(n)

In [260]:
## small set of reviews taken into consideration

small_gendered_reviews_mtc = clean_gendered_reviews_mtc[:40000]
small_gendered_reviews_mtc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   user_id        40000 non-null  object 
 1   book_id        40000 non-null  object 
 2   review_id      40000 non-null  object 
 3   rating         40000 non-null  int64  
 4   review_text    40000 non-null  object 
 5   date_added     40000 non-null  object 
 6   date_updated   40000 non-null  object 
 7   read_at        40000 non-null  object 
 8   started_at     40000 non-null  object 
 9   n_votes        40000 non-null  int64  
 10  n_comments     40000 non-null  int64  
 11  id             40000 non-null  float64
 12  name           40000 non-null  object 
 13  username       40000 non-null  object 
 14  age            40000 non-null  float64
 15  location       40000 non-null  object 
 16  joined         40000 non-null  object 
 17  friends_count  40000 non-null  float64
 18  groups

In [261]:
small_gendered_reviews_mtc.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,...,name,username,age,location,joined,friends_count,groups_count,reviews_count,first_name,gender
0,8842281e1d1347389f2ab93d60773d4d,6392944,5e212a62bced17b4dbe41150e5bb9037,3,read fun mystery book sure ever read poirot lo...,Mon Jul 24 02:48:17 -0700 2017,Sun Jul 30 09:28:03 -0700 2017,Tue Jul 25 00:00:00 -0700 2017,Mon Jul 24 00:00:00 -0700 2017,6,...,Otis Chandler,otis,44.0,"San Francisco, CA",08/2006,2032.0,125.0,1312.0,otis,male
1,8842281e1d1347389f2ab93d60773d4d,28684704,2ede853b14dc4583f96cf5d120af636f,3,fun fast paced science fiction thriller read 2...,Tue Nov 15 11:29:22 -0800 2016,Mon Mar 20 23:40:27 -0700 2017,Sat Mar 18 23:22:42 -0700 2017,Fri Mar 17 23:45:40 -0700 2017,22,...,Otis Chandler,otis,44.0,"San Francisco, CA",08/2006,2032.0,125.0,1312.0,otis,male
2,8842281e1d1347389f2ab93d60773d4d,32283133,8e4d61801907e591018bdc3442a9cf2b,0,,Tue Nov 01 11:09:18 -0700 2016,Tue Nov 01 11:09:44 -0700 2016,,,9,...,Otis Chandler,otis,44.0,"San Francisco, CA",08/2006,2032.0,125.0,1312.0,otis,male
3,8842281e1d1347389f2ab93d60773d4d,17860739,022bb6daffa49adc27f6b20b6ebeb37d,4,amazing unique creation jj abrams doug dorst c...,Wed Mar 26 13:51:30 -0700 2014,Tue Sep 23 01:44:36 -0700 2014,Sun Sep 21 00:00:00 -0700 2014,Sat Jul 26 00:00:00 -0700 2014,7,...,Otis Chandler,otis,44.0,"San Francisco, CA",08/2006,2032.0,125.0,1312.0,otis,male
4,8842281e1d1347389f2ab93d60773d4d,8694005,0e317947e1fd341f573192111bb2921d,3,name rose thrilling dan brown esque murder mys...,Wed Sep 08 01:22:27 -0700 2010,Wed Dec 14 12:30:43 -0800 2016,Mon Aug 10 00:00:00 -0700 2015,Mon Jul 20 00:00:00 -0700 2015,17,...,Otis Chandler,otis,44.0,"San Francisco, CA",08/2006,2032.0,125.0,1312.0,otis,male


In [112]:
topN_Freq(small_gendered_reviews_mtc['review_text'][44].split(), 5)

[('the', 16), ('and', 14), ('is', 11), ('a', 11), ('to', 10)]

In [262]:
from collections import Counter

result = {}
dic2 = {}
for idx in tqdm(range(small_gendered_reviews_mtc.shape[0])):
    
    freqDict = topN_Freq(small_gendered_reviews_mtc['review_text'][idx].split(), 50)
    dic1 = dict(freqDict)
    if idx != 0:
        result = {k: dic1.get(k, 0) + dic2.get(k, 0) for k in set(dic1) | set(dic2)}
    dic2 = dict(Counter(result).most_common(50))

result = dict(Counter(result).most_common(50))
result
    

100%|██████████| 40000/40000 [00:03<00:00, 11127.54it/s]


{'book': 42488,
 'read': 20975,
 'one': 19200,
 'story': 17052,
 'like': 12540,
 'characters': 10530,
 'would': 10109,
 'good': 9667,
 '`': 8278,
 'mystery': 8054,
 'novel': 7244,
 'de': 7165,
 'time': 6965,
 'get': 6059,
 'que': 5984,
 'e': 5814,
 'could': 5626,
 'character': 5494,
 'life': 4319,
 'people': 3917,
 'murder': 3734,
 'though': 3026,
 'lot': 2785,
 'l': 2613,
 'something': 2513,
 'work': 2390,
 'made': 2319,
 'feel': 2247,
 'killer': 2152,
 'world': 2109,
 'felt': 2096,
 'reader': 1922,
 '~': 1590,
 'b': 1496,
 'may': 1434,
 'fy': 1378,
 'mn': 1376,
 'house': 1339,
 'di': 1313,
 'dan': 1283,
 'yang': 1130,
 'holmes': 1106,
 'jack': 1034,
 'told': 837,
 'w': 825,
 'nick': 609,
 'lrwy': 474,
 'aku': 377,
 'te': 355,
 'maddie': 131}

In [121]:
result2 = result
result2

{'the': 9253538,
 'I': 6,
 'of': 3123155,
 'and': 4645974,
 'is': 1413487,
 'a': 4317474,
 'to': 11}

In [339]:
lexical_markers = []
for idx in tqdm(range(small_gendered_reviews_mtc.shape[0])):
    freq = Counter(small_gendered_reviews_mtc['review_text'][idx].split())
    lexical_markers.append(sum(freq.get(k, 0) for k in set(result)))
lexical_markers

100%|██████████| 40000/40000 [00:00<00:00, 52939.33it/s]


[10,
 34,
 0,
 40,
 26,
 7,
 6,
 9,
 4,
 0,
 5,
 17,
 6,
 1,
 2,
 2,
 2,
 4,
 7,
 29,
 1,
 4,
 16,
 2,
 3,
 2,
 6,
 18,
 12,
 21,
 13,
 3,
 4,
 12,
 11,
 11,
 5,
 5,
 7,
 0,
 0,
 4,
 16,
 1,
 7,
 3,
 4,
 8,
 7,
 30,
 6,
 6,
 10,
 7,
 1,
 0,
 1,
 8,
 1,
 33,
 31,
 56,
 37,
 53,
 35,
 40,
 66,
 24,
 18,
 41,
 20,
 22,
 36,
 9,
 34,
 44,
 23,
 53,
 52,
 36,
 74,
 1,
 28,
 63,
 23,
 19,
 8,
 0,
 131,
 17,
 9,
 188,
 62,
 4,
 2,
 7,
 8,
 18,
 0,
 7,
 4,
 1,
 1,
 1,
 1,
 3,
 2,
 6,
 1,
 1,
 7,
 7,
 1,
 2,
 3,
 1,
 8,
 3,
 1,
 5,
 20,
 6,
 7,
 2,
 3,
 6,
 1,
 2,
 1,
 2,
 8,
 2,
 7,
 8,
 1,
 0,
 1,
 4,
 6,
 1,
 0,
 3,
 0,
 9,
 2,
 2,
 1,
 7,
 14,
 7,
 9,
 5,
 4,
 28,
 11,
 6,
 2,
 12,
 4,
 3,
 10,
 8,
 13,
 11,
 13,
 6,
 6,
 4,
 2,
 2,
 19,
 17,
 4,
 1,
 4,
 1,
 10,
 1,
 13,
 4,
 12,
 12,
 17,
 7,
 4,
 28,
 13,
 14,
 8,
 1,
 11,
 7,
 9,
 7,
 13,
 9,
 0,
 5,
 1,
 4,
 0,
 3,
 3,
 5,
 3,
 8,
 3,
 8,
 9,
 8,
 11,
 8,
 9,
 6,
 12,
 6,
 7,
 10,
 10,
 9,
 12,
 2,
 9,
 6,
 4,
 12,
 6,
 15,
 9,
 8,
 1,

In [73]:
from nltk.tokenize import word_tokenize, sent_tokenize

def avgSentLenghtByWord(text):
    tokens = sent_tokenize(text)
    return np.average([len(token.split()) for token in tokens])

In [75]:
def countFunctionalWords(text):
    functional_words = """a between in nor some upon
    about both including nothing somebody us
    above but inside of someone used
    after by into off something via
    all can is on such we
    although cos it once than what
    am do its one that whatever
    among down latter onto the when
    an each less opposite their where
    and either like or them whether
    another enough little our these which
    any every lots outside they while
    anybody everybody many over this who
    anyone everyone me own those whoever
    anything everything more past though whom
    are few most per through whose
    around following much plenty till will
    as for must plus to with
    at from my regarding toward within
    be have near same towards without
    because he need several under worth
    before her neither she unless would
    behind him no should unlike yes
    below i nobody since until you
    beside if none so up your
    """

    functional_words = functional_words.split()
    count = 0

    for i in text:
        if i in functional_words:
            count += 1

    return count / len(text)

In [283]:
# hedgeWords = pd.read_csv("hedgeWords.txt", sep="\n")
hedgefile = open("hedgeWords.txt", "r")
content = hedgefile.read()
hedgeWords = content.split("\n")
hedgefile.close()

hedging = []
for idx in tqdm(range(small_gendered_reviews_mtc.shape[0])):
    hedging.append(sum(x == y for x, y in zip(small_gendered_reviews_mtc['review_text'], hedgeWords)))
hedging

100%|██████████| 40000/40000 [00:00<00:00, 48484.20it/s]


[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [296]:
import math 

def yulesCharacteristicK(words):
    N = len(words)
    K=0
    if N>0:
        freqs = Counter()
        freqs.update(words)
        vi = Counter()
        vi.update(freqs.values())
        M = sum([(value * value) * vi[value] for key, value in freqs.items()])
        K = 10000 * (M - N) / math.pow(N, 2)
    return K
vocabulary_richness = []
for idx in tqdm(range(small_gendered_reviews_mtc.shape[0])):
    vocabulary_richness.append(yulesCharacteristicK(small_gendered_reviews_mtc['review_text'][idx]))
vocabulary_richness

100%|██████████| 40000/40000 [00:00<00:00, 40711.56it/s]


[993.3412604042807,
 690.9164510112912,
 0,
 807.4050401753104,
 882.3805987423325,
 888.97705078125,
 932.7846364883402,
 825.3972791692426,
 1148.779070856993,
 2812.5,
 876.0952601260431,
 677.9248315721077,
 729.4562437451351,
 2600.0,
 1268.0993750952598,
 1694.5532217871128,
 1261.2244897959183,
 1615.9933361099543,
 980.9770648931488,
 699.8712243467488,
 952.9090527200168,
 746.3377507865891,
 831.7695627697505,
 2743.0555555555557,
 983.8003108901479,
 1772.2951561792665,
 823.7843285462333,
 827.575260043353,
 886.4465564614206,
 746.9982138657411,
 727.2273477881063,
 1289.4110648423557,
 1517.7382723784165,
 757.8913635635315,
 746.5859893886255,
 882.2664444662288,
 868.6370781501679,
 758.5131532619083,
 907.2803015274748,
 2123.456790123457,
 2638.8888888888887,
 730.2934630712696,
 777.8805361048722,
 1440.4432132963989,
 757.8237216541797,
 1030.0759777318722,
 1412.008671882364,
 788.6442284379202,
 744.3374143977167,
 713.618649146826,
 701.2439156300703,
 904.013332

In [364]:
from nltk.tokenize import sent_tokenize

def avg_wordLength(text):
    return np.average([len(word) for word in text.split()])

def avg_SentLenghtByCh(text):
    tokens = sent_tokenize(text)
    return np.average([len(token) for token in tokens])

def avg_SentLenghtByWord(text):
    tokens = sent_tokenize(text)
    return np.average([len(token.split()) for token in tokens])

complexity_1 = []
complexity_2 = []
complexity_3 = []
for idx in tqdm(range(small_gendered_reviews_mtc.shape[0])):
    if len(small_gendered_reviews_mtc['review_text'][idx])>0:
        complexity_1.append(avg_wordLength(small_gendered_reviews_mtc['review_text'][idx]))
        complexity_2.append(avg_SentLenghtByCh(small_gendered_reviews_mtc['review_text'][idx]))
        complexity_3.append(avg_SentLenghtByWord(small_gendered_reviews_mtc['review_text'][idx]))
        
#         complexity.append([avg_wordLength(small_gendered_reviews_mtc['review_text'][idx]), avg_SentLenghtByCh(small_gendered_reviews_mtc['review_text'][idx]), avg_SentLenghtByWord(small_gendered_reviews_mtc['review_text'][idx])])
    else:
        complexity_1.append(0)
        complexity_2.append(0)
        complexity_3.append(0)
complexity_1, complexity_2, complexity_3

100%|██████████| 40000/40000 [00:07<00:00, 5691.23it/s]


([5.767441860465116,
  5.8354978354978355,
  0,
  5.762557077625571,
  6.043715846994536,
  6.138888888888889,
  6.157894736842105,
  5.385542168674699,
  5.628571428571429,
  4.666666666666667,
  5.806451612903226,
  5.584615384615384,
  5.87719298245614,
  4.857142857142857,
  7.2,
  6.714285714285714,
  5.454545454545454,
  5.25,
  4.538461538461538,
  5.41044776119403,
  5.784313725490196,
  6.16,
  5.6477272727272725,
  5.25,
  6.0,
  5.333333333333333,
  5.448979591836735,
  5.338028169014085,
  6.111111111111111,
  6.319148936170213,
  6.170731707317073,
  6.5,
  6.125,
  5.74468085106383,
  6.259740259740259,
  5.468085106382978,
  6.354838709677419,
  6.256410256410256,
  6.416666666666667,
  5.571428571428571,
  5.5,
  6.153846153846154,
  6.263888888888889,
  5.4,
  6.0,
  5.538461538461538,
  5.5,
  5.1,
  5.938775510204081,
  5.6421052631578945,
  5.432835820895522,
  5.953488372093023,
  5.190476190476191,
  5.367647058823529,
  5.0,
  7.0,
  5.5,
  5.575757575757576,
  5

In [329]:
from nltk import word_tokenize, pos_tag

pronouns = []
for idx in tqdm(range(small_gendered_reviews_mtc.shape[0])):
    if len(small_gendered_reviews_mtc['review_text'][idx])>0:
        pronouns.append(sum(1 for word, pos in pos_tag(word_tokenize(small_gendered_reviews_mtc['review_text'][idx]), tagset='universal') if pos =='PRON')/len(word_tokenize(small_gendered_reviews_mtc['review_text'][idx])))
    else:
        pronouns.append(0)
    
pronouns

100%|██████████| 40000/40000 [01:59<00:00, 335.11it/s]


[0.0,
 0.008583690987124463,
 0,
 0.004545454545454545,
 0.005434782608695652,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.011363636363636364,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0027247956403269754,
 0.0027548209366391185,
 0.0,
 0.0,
 0.0,
 0.0026246719160104987,
 0.0031446540880503146,
 0.004784688995215311,
 0.003367003367003367,
 0.0,
 0.0,
 0.0033222591362126247,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0026455026455026454,
 0.0,
 0.011235955056179775,
 0.0018484288354898336,
 0.0,
 0.0,
 0.0025906735751295338,
 0.0,
 0.007194244604316547,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0035149384885764497,
 0.004178272980501393,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.

### Testing out Logistic Regression on TFIDF

In [166]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [379]:
df = pd.DataFrame()

In [374]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   complexity1  40000 non-null  float64
 1   complexity2  40000 non-null  float64
 2   complexity3  40000 non-null  float64
dtypes: float64(3)
memory usage: 937.6 KB


In [345]:
df['frequency'] = lexical_markers

In [297]:
df['hedging'] = hedging

ValueError: Length of values (0) does not match length of index (40000)

In [388]:
df['vocabulary_richness'] = vocabulary_richness

In [384]:
# df['complexity1'] = complexity_1
df['complexity2'] = complexity_2
df['complexity3'] = complexity_3

In [347]:
df['pronouns'] = pronouns

In [332]:
training_data, testing_data = train_test_split(small_gendered_reviews_mtc,random_state = 2000)

In [333]:
training_data.info(), testing_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 38067 to 27574
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   user_id        30000 non-null  object 
 1   book_id        30000 non-null  object 
 2   review_id      30000 non-null  object 
 3   rating         30000 non-null  int64  
 4   review_text    30000 non-null  object 
 5   date_added     30000 non-null  object 
 6   date_updated   30000 non-null  object 
 7   read_at        30000 non-null  object 
 8   started_at     30000 non-null  object 
 9   n_votes        30000 non-null  int64  
 10  n_comments     30000 non-null  int64  
 11  id             30000 non-null  float64
 12  name           30000 non-null  object 
 13  username       30000 non-null  object 
 14  age            30000 non-null  float64
 15  location       30000 non-null  object 
 16  joined         30000 non-null  object 
 17  friends_count  30000 non-null  float64
 18  gr

(None, None)

In [350]:
Y_train=training_data['gender'].values
Y_test=testing_data['gender'].values

In [178]:
def extract_features(df,field, training_data,testing_data):
    """Extract features using different methods""" 
        
    # TF-IDF BASED FEATURE REPRESENTATION
    tfidf_vectorizer=TfidfVectorizer(use_idf=True, max_df=0.95)
    tfidf_vectorizer.fit_transform(training_data[field].values)

    train_feature_set=tfidf_vectorizer.transform(training_data[field].values)
    test_feature_set=tfidf_vectorizer.transform(testing_data[field].values)

    return train_feature_set,test_feature_set,tfidf_vectorizer


In [179]:
X_train,X_test,feature_transformer=extract_features(small_gendered_reviews_mtc, 'review_text', training_data,testing_data)



In [180]:
X_train,X_test,feature_transformer

(<2270x17773 sparse matrix of type '<class 'numpy.float64'>'
 	with 108851 stored elements in Compressed Sparse Row format>,
 <757x17773 sparse matrix of type '<class 'numpy.float64'>'
 	with 34208 stored elements in Compressed Sparse Row format>,
 TfidfVectorizer(max_df=0.95))

In [389]:
scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=10000)
model=scikit_log_reg.fit(df[:30000],Y_train)

[LibLinear]

In [186]:
def get_top_k_predictions(model,X_test,k):
    
    # get probabilities instead of predicted labels, since we want to collect top 3
    probs = model.predict_proba(X_test)
    # GET TOP K PREDICTIONS BY PROB - note these are just index
    best_n = np.argsort(probs, axis=1)[:,-k:]

    # GET CATEGORY OF PREDICTIONS
    preds=[[model.classes_[predicted_cat] for predicted_cat in prediction] for prediction in best_n]

    # REVERSE CATEGORIES - DESCENDING ORDER OF IMPORTANCE
    preds=[ item[::-1] for item in preds]

    return preds

In [360]:
probs = model.predict_proba(X_test)

ValueError: X has 17773 features per sample; expecting 3

In [390]:
predicted = model.predict(df[30000:])
predicted

array(['female', 'female', 'female', ..., 'female', 'female', 'female'],
      dtype=object)

In [391]:
from sklearn.metrics import log_loss,accuracy_score

print(("Accuracy  ") + str(accuracy_score(Y_test, predicted)))

Accuracy  0.5311


In [190]:
get_top_k_predictions(model,X_test,5)

[['female', 'male', 'unknown'],
 ['male', 'unknown', 'female'],
 ['female', 'unknown', 'male'],
 ['female', 'male', 'unknown'],
 ['female', 'unknown', 'male'],
 ['female', 'unknown', 'male'],
 ['male', 'female', 'unknown'],
 ['female', 'male', 'unknown'],
 ['female', 'unknown', 'male'],
 ['male', 'female', 'unknown'],
 ['female', 'male', 'unknown'],
 ['male', 'female', 'unknown'],
 ['male', 'female', 'unknown'],
 ['female', 'male', 'unknown'],
 ['male', 'female', 'unknown'],
 ['female', 'unknown', 'male'],
 ['female', 'male', 'unknown'],
 ['female', 'male', 'unknown'],
 ['female', 'male', 'unknown'],
 ['female', 'male', 'unknown'],
 ['female', 'male', 'unknown'],
 ['female', 'male', 'unknown'],
 ['female', 'male', 'unknown'],
 ['female', 'male', 'unknown'],
 ['female', 'male', 'unknown'],
 ['female', 'male', 'unknown'],
 ['female', 'male', 'unknown'],
 ['female', 'male', 'unknown'],
 ['female', 'unknown', 'male'],
 ['female', 'unknown', 'male'],
 ['female', 'unknown', 'male'],
 ['femal