# Supervised Learning Techniques


## Dependencies and classes


In [186]:
from pydantic import BaseModel, Field
from typing import Optional, List
import enum
import json

class EducationLevels(str, enum.Enum):
    HIGH_SCHOOL = "high_school"
    BACHELORS = "bachelors"
    MASTERS = "masters"
    PHD = "phd"
    NONE = "none"

class Location(BaseModel):
    city: str
    state_or_province: str
    country: str

class FakeProfile(BaseModel):
    name: str
    occupation: str
    industry: str
    job_description: str
    education: EducationLevels
    major: Optional[str] = Field(default=None)
    location: Location

    @classmethod
    def from_json(cls, data: str):
        return cls(**json.loads(data))

class FakeProfiles(BaseModel):
    profiles: List[FakeProfile]

    @classmethod
    def from_json(cls, data: str):
        return cls(**json.loads(data))

In [187]:
NEWSGROUPS = [
    'alt.atheism',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'sci.med',
    'rec.sport.hockey',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns'
]

## Loading the data


Load articles.

In [188]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'sci.med', 'rec.sport.hockey', 'sci.space', 'soc.religion.christian', 'talk.politics.guns']

newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, random_state=2, return_X_y=True)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, random_state=2, return_X_y=True)

num_articles_newsgroups_train = len(newsgroups_train[0])
num_articles_newsgroups_test = len(newsgroups_test[0])

Load profiles data, pair them with articles, and store in a dataframe with corresponding output (True/False depending on whether the article is relevant to the profile).

In [189]:
import pandas as pd
import numpy as np

profiles = []

# Load the profiles from the json files
for news_group in NEWSGROUPS:
    
    with open(f"../fake_profiles/{news_group.replace('.', '_')}.json", "r", encoding='utf-8') as f:
        
        # get profiles for current news group
        profiles_group = FakeProfiles.from_json(f.read())
        
        # store all profiles in dataframe
        for profile in profiles_group.profiles:
            # convert profile to dict
            profile_dict = profile.model_dump()

            # get random article from train set
            index = np.random.choice(num_articles_newsgroups_train)
            random_article = newsgroups_train[0][index]
            article_label = newsgroups_train[1][index]
            
            # determine whether article relevant (same category)
            is_relevant = (article_label == categories.index(news_group))
            
            # attach article and is_relevant to profile
            profile_dict['article'] = random_article
            profile_dict['is_relevant'] = is_relevant
            
            # append profile to list           
            profiles.append(profile_dict)
            
        
df = pd.DataFrame(profiles)
df.head()

Unnamed: 0,name,occupation,industry,job_description,education,major,location,article,is_relevant
0,Thomas Reed,Journalist,Media,"Writes articles on various social topics, incl...",EducationLevels.BACHELORS,Journalism,"{'city': 'Austin', 'state_or_province': 'Texas...",From: bil@okcforum.osrhe.edu (Bill Conner)\nSu...,True
1,Samantha Brooks,College Professor,Education,"Teaches courses on philosophy, including metap...",EducationLevels.PHD,Philosophy,"{'city': 'Berkeley', 'state_or_province': 'Cal...",From: lundby@rtsg.mot.com (Walter F. Lundby)\n...,False
2,Marcus Li,Software Developer,Technology,Develops mobile applications with a focus on s...,EducationLevels.BACHELORS,Computer Science,"{'city': 'Seattle', 'state_or_province': 'Wash...",From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...,True
3,Emily Nguyen,Human Rights Activist,NGO,Advocates for freedom of belief and expression...,EducationLevels.MASTERS,International Relations,"{'city': 'New York', 'state_or_province': 'New...",Subject: Space FAQ 01/15 - Introduction\nFrom:...,False
4,Jordan Smith,Blogger,Digital Media,"Runs a popular blog discussing religion, athei...",EducationLevels.BACHELORS,English,"{'city': 'Denver', 'state_or_province': 'Color...",From: mangoe@cs.umd.edu (Charley Wingate)\nSub...,True


## Preparing the data for model fitting


### Preprocessing text


Preprocessing text fields for each profile. Techniques include stopword removal and lemmatization.


In [190]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_shorter_text(text):
    # remove blanks and convert to lower case
    if text is None:
        return 'none'
    else:
        return text.lower().strip()

# more preprocessing for longer text
def preprocess_longer_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    lemmatized_text = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sevag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sevag\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocess fields and add back to the dataframe.


In [191]:
# simple preprocessing for occupation, industry, major
shorter_text_fields = ['occupation', 'industry', 'major']
for field in shorter_text_fields:
    df[field] = df[field].apply(preprocess_shorter_text)
    
# preprocessing for longer text fields
df['job_description_preprocessed'] = df['job_description'].apply(preprocess_longer_text)
df['article'] = df['article'].apply(preprocess_longer_text)
    
# flatten location
df['city'] = df['location'].apply(lambda x: preprocess_shorter_text(x['city']))
df['state_or_province'] = df['location'].apply(lambda x: preprocess_shorter_text(x['state_or_province']))
df['country'] = df['location'].apply(lambda x: preprocess_shorter_text(x['country']))

df.head()

Unnamed: 0,name,occupation,industry,job_description,education,major,location,article,is_relevant,job_description_preprocessed,city,state_or_province,country
0,Thomas Reed,journalist,media,"Writes articles on various social topics, incl...",EducationLevels.BACHELORS,journalism,"{'city': 'Austin', 'state_or_province': 'Texas...",From : bil @ okcforum.osrhe.edu ( Bill Conner ...,True,"Writes article various social topic , includin...",austin,texas,usa
1,Samantha Brooks,college professor,education,"Teaches courses on philosophy, including metap...",EducationLevels.PHD,philosophy,"{'city': 'Berkeley', 'state_or_province': 'Cal...",From : lundby @ rtsg.mot.com ( Walter F. Lundb...,False,"Teaches course philosophy , including metaphys...",berkeley,california,usa
2,Marcus Li,software developer,technology,Develops mobile applications with a focus on s...,EducationLevels.BACHELORS,computer science,"{'city': 'Seattle', 'state_or_province': 'Wash...",From : I3150101 @ dbstu1.rz.tu-bs.de ( Benedik...,True,Develops mobile application focus social netwo...,seattle,washington,usa
3,Emily Nguyen,human rights activist,ngo,Advocates for freedom of belief and expression...,EducationLevels.MASTERS,international relations,"{'city': 'New York', 'state_or_province': 'New...",Subject : Space FAQ 01/15 - Introduction From ...,False,Advocates freedom belief expression around wor...,new york,new york,usa
4,Jordan Smith,blogger,digital media,"Runs a popular blog discussing religion, athei...",EducationLevels.BACHELORS,english,"{'city': 'Denver', 'state_or_province': 'Color...",From : mangoe @ cs.umd.edu ( Charley Wingate )...,True,"Runs popular blog discussing religion , atheis...",denver,colorado,usa


## Fitting models to the data


Extract numerical and categorical features from the data.


In [192]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse as sp

# vectorization for job description
vectorizer = TfidfVectorizer()
X_job_description = vectorizer.fit_transform(df['job_description_preprocessed'])
X_article = vectorizer.fit_transform(df['article'])

# one-hot encoding for education
X_education = pd.get_dummies(df['education'])

# count vectorize occupation
X_occupation = CountVectorizer()

# bag of words features
bow_fields = ['occupation', 'industry', 'major', 'city', 'state_or_province', 'country']
bow_features = {}
for field in bow_fields:
    vectorizer = CountVectorizer()
    bow_features[field] = vectorizer.fit_transform(df[field])
    
# combine covariates
X_job_description_sparse = sp.csr_matrix(X_job_description)
X_article_sparse = sp.csr_matrix(X_article)
X_categorical_sparse = sp.csr_matrix(X_education)
X_combined = sp.hstack([X_job_description_sparse, X_article_sparse, X_categorical_sparse] + [bow_features[field] for field in bow_fields])

# extract targets
y = df['is_relevant']

### Classification


In this section, we fit supervised machine learning models with binary outputs. The outputs correspond to True/False depending on whether the article in the input is relevant to the profile in the input.

#### Logistic regression


In [193]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# logistic regression model
logistic_model = LogisticRegression(max_iter=1000, random_state=1)

# 6-fold cross validation
cv_logistic = cross_val_score(logistic_model, X_combined, y, cv=6)
print(f"Logistic regression average prediction score: {cv_logistic.mean():.4f}")

Logistic regression average prediction score: 0.8841


#### Naive Bayes


In [194]:
from sklearn.naive_bayes import MultinomialNB

# bernoulli bayes model (binary classification)
nb = MultinomialNB()

# 6-fold cross validation
cv_nb = cross_val_score(nb, X_combined, y, cv=6)
print(f"Naive Bayes average prediction score: {cv_nb.mean():.4f}")

Naive Bayes average prediction score: 0.8841


#### Decision tree


In [195]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=1)

# 6-fold cross validation
cv_dt = cross_val_score(dt, X_combined, y, cv=6)
print(f"Decision tree average prediction score: {cv_dt.mean():.4f}")

Decision tree average prediction score: 0.7785


#### Support vector machine


In [196]:
from sklearn.svm import SVC

svm = SVC(random_state=1)

# 6-fold cross validation
cv_svm = cross_val_score(svm, X_combined, y, cv=6)
print(f"SVM average prediction score: {cv_svm.mean():.4f}")

SVM average prediction score: 0.8841


## Saving metrics

In [197]:
# change if don't want to save metrics to tables/supervised
SAVE_METRICS = True

In [199]:
# model metrics
df_pairwise = pd.DataFrame(index=['Logistic', 'Naive Bayes', 'Decision tree', 'SVM'], 
                             columns=['Average prediction score'])
df_pairwise.loc['Logistic', :] = cv_logistic.mean()
df_pairwise.loc['Naive Bayes', :] = cv_nb.mean()
df_pairwise.loc['Decision tree', :] = cv_dt.mean()
df_pairwise.loc['SVM', :] = cv_svm.mean()

if SAVE_METRICS:
    df_pairwise.to_csv('../tables/supervised/supervised_scores.csv')