# Supervised Learning Techniques


## Dependencies and classes


In [766]:
from pydantic import BaseModel, Field
from typing import Optional, List
import enum
import json

class EducationLevels(str, enum.Enum):
    HIGH_SCHOOL = "high_school"
    BACHELORS = "bachelors"
    MASTERS = "masters"
    PHD = "phd"
    NONE = "none"

class Location(BaseModel):
    city: str
    state_or_province: str
    country: str

class FakeProfile(BaseModel):
    name: str
    occupation: str
    industry: str
    job_description: str
    education: EducationLevels
    major: Optional[str] = Field(default=None)
    location: Location

    @classmethod
    def from_json(cls, data: str):
        return cls(**json.loads(data))

class FakeProfiles(BaseModel):
    profiles: List[FakeProfile]

    @classmethod
    def from_json(cls, data: str):
        return cls(**json.loads(data))

In [767]:
NEWSGROUPS = [
    'alt.atheism',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'sci.med',
    'rec.sport.hockey',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns'
]

## Loading the data


Load articles.

In [768]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'sci.med', 'rec.sport.hockey', 'sci.space', 'soc.religion.christian', 'talk.politics.guns']

newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, random_state=2, return_X_y=True)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, random_state=2, return_X_y=True)

num_articles_newsgroups_train = len(newsgroups_train[0])
num_articles_newsgroups_test = len(newsgroups_test[0])

Load profiles data, pair them with articles, and store in a dataframe with corresponding output (True/False depending on whether the article is relevant to the profile).

In [769]:
import pandas as pd
import numpy as np

profiles = []

# Load the profiles from the json files
for news_group in NEWSGROUPS:
    
    with open(f"../fake_profiles/{news_group.replace('.', '_')}.json", "r", encoding='utf-8') as f:
        
        # get profiles for current news group
        profiles_group = FakeProfiles.from_json(f.read())
        
        # store all profiles in dataframe
        for profile in profiles_group.profiles:
            # convert profile to dict
            profile_dict = profile.model_dump()

            # attach true news group to profile
            profile_dict['news_group'] = news_group
            
            # get random article from train set
            index = np.random.choice(num_articles_newsgroups_train)
            random_article = newsgroups_train[0][index]
            article_label = newsgroups_train[1][index]
            
            # determine whether article relevant (same category)
            is_relevant = (article_label == categories.index(news_group))
            
            # attach article and is_relevant to profile
            profile_dict['article'] = random_article
            profile_dict['is_relevant'] = is_relevant
            
            # append profile to list           
            profiles.append(profile_dict)
            
        
df = pd.DataFrame(profiles)
df.head()

Unnamed: 0,name,occupation,industry,job_description,education,major,location,news_group,article,is_relevant
0,Thomas Reed,Journalist,Media,"Writes articles on various social topics, incl...",EducationLevels.BACHELORS,Journalism,"{'city': 'Austin', 'state_or_province': 'Texas...",alt.atheism,From: cdash@moet.cs.colorado.edu (Charles Shub...,False
1,Samantha Brooks,College Professor,Education,"Teaches courses on philosophy, including metap...",EducationLevels.PHD,Philosophy,"{'city': 'Berkeley', 'state_or_province': 'Cal...",alt.atheism,Subject: Diffs to sci.space/sci.astro Frequent...,False
2,Marcus Li,Software Developer,Technology,Develops mobile applications with a focus on s...,EducationLevels.BACHELORS,Computer Science,"{'city': 'Seattle', 'state_or_province': 'Wash...",alt.atheism,From: reid@cs.uiuc.edu (Jon Reid)\nSubject: Ce...,False
3,Emily Nguyen,Human Rights Activist,NGO,Advocates for freedom of belief and expression...,EducationLevels.MASTERS,International Relations,"{'city': 'New York', 'state_or_province': 'New...",alt.atheism,From: jacquier@gsbux1.uchicago.edu (Eric Jacqu...,False
4,Jordan Smith,Blogger,Digital Media,"Runs a popular blog discussing religion, athei...",EducationLevels.BACHELORS,English,"{'city': 'Denver', 'state_or_province': 'Color...",alt.atheism,From: sp1henhj@edit (Henrik Balthazar Hjort)\n...,False


## Preparing the data for model fitting


### Preprocessing text


Preprocessing text fields for each profile. Techniques include stopword removal and lemmatization.


In [770]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_shorter_text(text):
    # remove blanks and convert to lower case
    if text is None:
        return 'none'
    else:
        return text.lower().strip()

# more preprocessing for longer text
def preprocess_longer_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    lemmatized_text = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sevag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sevag\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocess fields and add back to the dataframe.


In [771]:
# simple preprocessing for occupation, industry, major
shorter_text_fields = ['occupation', 'industry', 'major']
for field in shorter_text_fields:
    df[field] = df[field].apply(preprocess_shorter_text)
    
# preprocessing for longer text fields
df['job_description_preprocessed'] = df['job_description'].apply(preprocess_longer_text)
df['article'] = df['article'].apply(preprocess_longer_text)
    
# flatten location
df['city'] = df['location'].apply(lambda x: preprocess_shorter_text(x['city']))
df['state_or_province'] = df['location'].apply(lambda x: preprocess_shorter_text(x['state_or_province']))
df['country'] = df['location'].apply(lambda x: preprocess_shorter_text(x['country']))

df.head()

Unnamed: 0,name,occupation,industry,job_description,education,major,location,news_group,article,is_relevant,job_description_preprocessed,city,state_or_province,country
0,Thomas Reed,journalist,media,"Writes articles on various social topics, incl...",EducationLevels.BACHELORS,journalism,"{'city': 'Austin', 'state_or_province': 'Texas...",alt.atheism,From : cdash @ moet.cs.colorado.edu ( Charles ...,False,"Writes article various social topic , includin...",austin,texas,usa
1,Samantha Brooks,college professor,education,"Teaches courses on philosophy, including metap...",EducationLevels.PHD,philosophy,"{'city': 'Berkeley', 'state_or_province': 'Cal...",alt.atheism,Subject : Diffs sci.space/sci.astro Frequently...,False,"Teaches course philosophy , including metaphys...",berkeley,california,usa
2,Marcus Li,software developer,technology,Develops mobile applications with a focus on s...,EducationLevels.BACHELORS,computer science,"{'city': 'Seattle', 'state_or_province': 'Wash...",alt.atheism,From : reid @ cs.uiuc.edu ( Jon Reid ) Subject...,False,Develops mobile application focus social netwo...,seattle,washington,usa
3,Emily Nguyen,human rights activist,ngo,Advocates for freedom of belief and expression...,EducationLevels.MASTERS,international relations,"{'city': 'New York', 'state_or_province': 'New...",alt.atheism,From : jacquier @ gsbux1.uchicago.edu ( Eric J...,False,Advocates freedom belief expression around wor...,new york,new york,usa
4,Jordan Smith,blogger,digital media,"Runs a popular blog discussing religion, athei...",EducationLevels.BACHELORS,english,"{'city': 'Denver', 'state_or_province': 'Color...",alt.atheism,From : sp1henhj @ edit ( Henrik Balthazar Hjor...,False,"Runs popular blog discussing religion , atheis...",denver,colorado,usa


## Fitting models to the data


Extract numerical and categorical features from the data.


In [772]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse as sp

# vectorization for job description
vectorizer = TfidfVectorizer()
X_job_description = vectorizer.fit_transform(df['job_description_preprocessed'])
X_article = vectorizer.fit_transform(df['article'])

# one-hot encoding for education
X_education = pd.get_dummies(df['education'])
education_columns = X_education.columns

# bag of words features
bow_fields = ['occupation', 'industry', 'major', 'city', 'state_or_province', 'country']
bow_features = {}
for field in bow_fields:
    vectorizer = CountVectorizer()
    bow_features[field] = vectorizer.fit_transform(df[field])
    
# combine covariates
X_job_description_sparse = sp.csr_matrix(X_job_description)
X_article_sparse = sp.csr_matrix(X_article)
X_categorical_sparse = sp.csr_matrix(X_education)
X_combined = sp.hstack([X_job_description_sparse, X_article_sparse, X_categorical_sparse] + [bow_features[field] for field in bow_fields])

# extract targets
y = df['is_relevant']

### General classification


In this section, we fit supervised machine learning models with binary outputs over all news groups. The outputs correspond to True/False depending on whether the article in the input is relevant to the profile in the input.

#### Logistic regression


In [773]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# logistic regression model
logistic_model = LogisticRegression(max_iter=1000, random_state=1)

# 6-fold cross validation
cv_logistic = cross_val_score(logistic_model, X_combined, y, scoring='f1', cv=6)
print(f"Logistic regression average prediction score: {cv_logistic.mean():.4f}")

Logistic regression average prediction score: 0.0000


#### Naive Bayes


In [774]:
from sklearn.naive_bayes import MultinomialNB

# bernoulli bayes model (binary classification)
nb = MultinomialNB()

# 6-fold cross validation
cv_nb = cross_val_score(nb, X_combined, y, scoring='f1', cv=6)
print(f"Naive Bayes average prediction score: {cv_nb.mean():.4f}")

Naive Bayes average prediction score: 0.0000


#### Decision tree


In [775]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=1)

# 6-fold cross validation
cv_dt = cross_val_score(dt, X_combined, y, scoring='f1', cv=6)
print(f"Decision tree average prediction score: {cv_dt.mean():.4f}")

Decision tree average prediction score: 0.0000


#### Support vector machine


In [776]:
from sklearn.svm import SVC

svm = SVC(random_state=1)

# 6-fold cross validation
cv_svm = cross_val_score(svm, X_combined, y, scoring='f1', cv=6)
print(f"SVM average prediction score: {cv_svm.mean():.4f}")

SVM average prediction score: 0.0000


### Per-news group classification


TODO:
1) train a classifier which determines which group an article comes from
2) train different models per news group
3) in test set, try to guess news group of an article and then use corresponding classifier

In this section, we fit supervised machine learning models with binary outputs, one model per news group. The outputs correspond to True/False depending on whether the article in the input is relevant to the profile in the input.


First, we train a classifier that tries to predict the news group of an article.

In [777]:
# vectorize articles
article_classifier_vectorizer = TfidfVectorizer()
X_article_classifier = article_classifier_vectorizer.fit_transform(newsgroups_train[0])
# obtain true labels per article
y_article_classifier = newsgroups_train[1]

# create different classifiers
lr_article_classifier = LogisticRegression(max_iter=1000, random_state=2)
nb_article_classifier = MultinomialNB()
dt_article_classifier = DecisionTreeClassifier(random_state=2)
svm_article_classifier = SVC(random_state=2)

# train-test split
X_article_classifier_train, X_article_classifier_test, \
    y_article_classifier_train, y_article_classifier_test \
        = train_test_split(X_article_classifier, y_article_classifier, \
        test_size=0.2, random_state=2)

# fit classifiers    
lr_article_classifier.fit(X_article_classifier_train, y_article_classifier_train) 
nb_article_classifier.fit(X_article_classifier_train, y_article_classifier_train)
dt_article_classifier.fit(X_article_classifier_train, y_article_classifier_train)
svm_article_classifier.fit(X_article_classifier_train, y_article_classifier_train)

# obtain prediction scores
lr_article_classifier_score = lr_article_classifier.score(X_article_classifier_test, y_article_classifier_test)
nb_article_classifier_score = nb_article_classifier.score(X_article_classifier_test, y_article_classifier_test)
dt_article_classifier_score = dt_article_classifier.score(X_article_classifier_test, y_article_classifier_test)
svm_article_classifier_score = svm_article_classifier.score(X_article_classifier_test, y_article_classifier_test)

print(f'Logistic regression article classifier prediction score: {lr_article_classifier_score}')
print(f'Naive Bayes article classifier prediction score: {nb_article_classifier_score}')
print(f'Decision tree article classifier prediction score: {dt_article_classifier_score}')
print(f'SVM article classifier prediction score: {svm_article_classifier_score}')

Logistic regression article classifier prediction score: 0.9459980713596914
Naive Bayes article classifier prediction score: 0.9218900675024108
Decision tree article classifier prediction score: 0.7540983606557377
SVM article classifier prediction score: 0.944069431051109


Logistic regression obtains the highest prediction score, so we will use that classifier.

In [778]:
article_classifier = lr_article_classifier

In [779]:
# create vectorizer for articles
article_vectorizer = TfidfVectorizer()
article_vectorizer.fit(df['article'])

# remove pre-existing articles and labels from df
df = df.drop(columns=['article', 'is_relevant'])

We now train different models per news group. Each model is trained on arbitrary user profiles and articles from the news group, and tries to predict whether the article is relevant to the profile.

In [780]:
# duplicate df to increase train/test size
# any given profile can be associated with multiple different articles
# (corresponds to different rows in the dataframe)
df_new = pd.concat([df, df, df, df ,df])

In [781]:
models_per_newsgroup = []
models_f1_scores_per_newsgroup = []

# save vectorizers for future use in testing
job_description_vectorizer = TfidfVectorizer()

# fit vectorizers on dataset
job_description_vectorizer.fit(df_new['job_description_preprocessed'])

bow_fields = ['occupation', 'industry', 'major', 'city', 'state_or_province', 'country']
bow_vectorizers = {}
for field in bow_fields:
    vectorizer = CountVectorizer()
    vectorizer.fit(df_new[field])
    bow_vectorizers[field] = vectorizer

num_profiles = len(df_new)

for news_group in NEWSGROUPS:
    
    # fetch indices of articles with label corresponding to the newsgroup
    newsgroup_indices = np.nonzero(newsgroups_train[1] == categories.index(news_group))[0]
    
    # take the articles at the given indices
    newsgroup_articles = np.take(newsgroups_train[0], indices=newsgroup_indices)
    
    # sample articles randomly
    articles = np.random.choice(newsgroup_articles, size=num_profiles)
    
    # add articles to dataframe
    df_new['article'] = articles    
    
    # add is_relevant field
    # True if profile is from the same news group
    df_new['is_relevant'] = df_new['news_group'] == news_group 
    
    # process data
    # vectorization for job description
    X_job_description = job_description_vectorizer.transform(df_new['job_description_preprocessed'])
    X_article = article_vectorizer.transform(df_new['article'])
    
    # one-hot encoding for education
    X_education = pd.get_dummies(df_new['education'])

    # bag of words features
    bow_features = {}
    for field in bow_fields:
        bow_features[field] = bow_vectorizers[field].transform(df_new[field])
        
    # combine covariates
    X_job_description_sparse = sp.csr_matrix(X_job_description)
    X_article_sparse = sp.csr_matrix(X_article)
    X_categorical_sparse = sp.csr_matrix(X_education)
    X_combined = sp.hstack([X_job_description_sparse, X_article_sparse, X_categorical_sparse] + [bow_features[field] for field in bow_fields])

    # extract targets
    y = df_new['is_relevant']
    
    # train-test split
    indices = np.arange(num_profiles)
    # X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=2, stratify=y)
    X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X_combined, y, indices, test_size=0.2, random_state=2, stratify=y)
    
    # create model
    lr = LogisticRegression(max_iter=1000, random_state=2)

    # fit model
    lr.fit(X_train, y_train) 
    
    # obtain predictions
    y_pred_lr = lr.predict(X_test)
    
    # obtain f1 scores
    lr_f1 = f1_score(y_test, y_pred_lr)
    
    # add model to list
    models_per_newsgroup.append(lr)
    models_f1_scores_per_newsgroup.append(lr_f1)

Here are the model F1 scores per news group.

In [782]:
for i in range(len(NEWSGROUPS)):
    print(f'{NEWSGROUPS[i]}: {models_f1_scores_per_newsgroup[i]}')

alt.atheism: 0.9189189189189189
comp.windows.x: 0.9743589743589743
misc.forsale: 1.0
rec.autos: 0.9743589743589743
sci.med: 1.0
rec.sport.hockey: 1.0
sci.space: 1.0
soc.religion.christian: 1.0
talk.politics.guns: 1.0


We now make use of the pipeline to predict relevancy of profiles and articles.

Retrieve test instances.

In [783]:
# choose random articles from newsgroups_test, one for each profile
test_indices = np.random.choice(num_articles_newsgroups_test, size=len(df))

# attach articles and their labels to df
test_articles = np.array(newsgroups_test[0])[test_indices]
test_labels = np.array(newsgroups_test[1])[test_indices]

# store article and article's true news group
df['article'] = test_articles
df['true_article_news_group'] = np.array(categories)[test_labels]

# predict article news group
X_test_articles = article_classifier_vectorizer.transform(df['article'])
predicted_news_groups = article_classifier.predict(X_test_articles)
df['predicted_article_news_group'] = np.array(categories)[predicted_news_groups]

Group data by predicted article news groups and test on each.

In [784]:
grouped = df.groupby('predicted_article_news_group')

true_ys = []
predicted_ys = []

for news_group, group in grouped:
    
    news_group_index = categories.index(news_group)
    model = models_per_newsgroup[news_group_index]
    
    # process data
    # vectorization for job description
    X_job_description = job_description_vectorizer.transform(group['job_description_preprocessed'])
    X_article = article_vectorizer.transform(group['article'])
    
    # one-hot encoding for education
    X_education = pd.get_dummies(group['education']).reindex(columns=education_columns, fill_value=False)

    # bag of words features
    bow_features = {}
    for field in bow_fields:
        bow_features[field] = bow_vectorizers[field].transform(group[field])
        
    # combine covariates
    X_job_description_sparse = sp.csr_matrix(X_job_description)
    X_article_sparse = sp.csr_matrix(X_article)
    X_categorical_sparse = sp.csr_matrix(X_education)
    X_combined = sp.hstack([X_job_description_sparse, X_article_sparse, X_categorical_sparse] + [bow_features[field] for field in bow_fields])

    # extract targets
    # True if profile's news group is equal to article news group
    y = group['news_group'] == group['true_article_news_group']
    
    # predict
    y_pred = model.predict(X_combined)
    
    # add values to lists
    true_ys.append(y)
    predicted_ys.append(y_pred)

Get prediction accuracy by comparing the lists.

In [785]:
from sklearn.metrics import accuracy_score

true_y_array = np.concatenate(true_ys)
predicted_y_array = np.concatenate(predicted_ys)
    
acc = accuracy_score(true_y_array, predicted_y_array)
print(f'Prediction accuracy: {acc}')

Prediction accuracy: 0.988950276243094


## Saving metrics

In [786]:
# change if don't want to save metrics to tables/supervised
SAVE_METRICS = True

In [787]:
# model metrics
df_pairwise = pd.DataFrame(index=['Logistic', 'Naive Bayes', 'Decision tree', 'SVM'], 
                             columns=['Average prediction score'])
df_pairwise.loc['Logistic', :] = cv_logistic.mean()
df_pairwise.loc['Naive Bayes', :] = cv_nb.mean()
df_pairwise.loc['Decision tree', :] = cv_dt.mean()
df_pairwise.loc['SVM', :] = cv_svm.mean()

if SAVE_METRICS:
    df_pairwise.to_csv('../tables/supervised/supervised_scores.csv')