# Supervised Learning Techniques


## Dependencies and classes


In [305]:
from pydantic import BaseModel, Field
from typing import Optional, List
import enum
import json

class EducationLevels(str, enum.Enum):
    HIGH_SCHOOL = "high_school"
    BACHELORS = "bachelors"
    MASTERS = "masters"
    PHD = "phd"
    NONE = "none"

class Location(BaseModel):
    city: str
    state_or_province: str
    country: str

class FakeProfile(BaseModel):
    name: str
    occupation: str
    industry: str
    job_description: str
    education: EducationLevels
    major: Optional[str] = Field(default=None)
    location: Location

    @classmethod
    def from_json(cls, data: str):
        return cls(**json.loads(data))

class FakeProfiles(BaseModel):
    profiles: List[FakeProfile]

    @classmethod
    def from_json(cls, data: str):
        return cls(**json.loads(data))

In [306]:
NEWSGROUPS = [
    'alt.atheism',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'sci.med',
    'rec.sport.hockey',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns'
]

## Loading the data


Load profiles data and store in a dataframe with their true labels.


In [307]:
import pandas as pd

profiles = []

# Load the profiles from the json files
for news_group in NEWSGROUPS:
    
    with open(f"../fake_profiles/{news_group.replace('.', '_')}.json", "r", encoding='utf-8') as f:
        
        # get profiles for current news group
        profiles_group = FakeProfiles.from_json(f.read())
        
        # store all profiles in dataframe
        for profile in profiles_group.profiles:
            # convert profile to dict
            profile_dict = profile.model_dump()
            # add news group field (true label for profile)
            profile_dict['news_group'] = news_group
            profiles.append(profile_dict)
            
        
df = pd.DataFrame(profiles)
df.head()

Unnamed: 0,name,occupation,industry,job_description,education,major,location,news_group
0,Thomas Reed,Journalist,Media,"Writes articles on various social topics, incl...",EducationLevels.BACHELORS,Journalism,"{'city': 'Austin', 'state_or_province': 'Texas...",alt.atheism
1,Samantha Brooks,College Professor,Education,"Teaches courses on philosophy, including metap...",EducationLevels.PHD,Philosophy,"{'city': 'Berkeley', 'state_or_province': 'Cal...",alt.atheism
2,Marcus Li,Software Developer,Technology,Develops mobile applications with a focus on s...,EducationLevels.BACHELORS,Computer Science,"{'city': 'Seattle', 'state_or_province': 'Wash...",alt.atheism
3,Emily Nguyen,Human Rights Activist,NGO,Advocates for freedom of belief and expression...,EducationLevels.MASTERS,International Relations,"{'city': 'New York', 'state_or_province': 'New...",alt.atheism
4,Jordan Smith,Blogger,Digital Media,"Runs a popular blog discussing religion, athei...",EducationLevels.BACHELORS,English,"{'city': 'Denver', 'state_or_province': 'Color...",alt.atheism


## Preparing the data for model fitting


### Preprocessing text


Preprocessing text fields for each profile. Techniques include stopword removal and lemmatization.


In [308]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text_field(text):
    # remove blanks and convert to lower case
    if text is None:
        return 'none'
    else:
        return text.lower().strip()

# more preprocessing for job description
def preprocess_job_description(text):
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    lemmatized_text = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sevag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sevag\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocess fields and add back to the dataframe.


In [309]:
# simple preprocessing for occupation, industry, major
text_fields = ['occupation', 'industry', 'major']
for field in text_fields:
    df[field] = df[field].apply(preprocess_text_field)
    
# preprocessing for job description
df['job_description_preprocessed'] = df['job_description'].apply(preprocess_job_description)
    
# flatten location
df['city'] = df['location'].apply(lambda x: preprocess_text_field(x['city']))
df['state_or_province'] = df['location'].apply(lambda x: preprocess_text_field(x['state_or_province']))
df['country'] = df['location'].apply(lambda x: preprocess_text_field(x['country']))

df.head()

Unnamed: 0,name,occupation,industry,job_description,education,major,location,news_group,job_description_preprocessed,city,state_or_province,country
0,Thomas Reed,journalist,media,"Writes articles on various social topics, incl...",EducationLevels.BACHELORS,journalism,"{'city': 'Austin', 'state_or_province': 'Texas...",alt.atheism,"Writes article various social topic , includin...",austin,texas,usa
1,Samantha Brooks,college professor,education,"Teaches courses on philosophy, including metap...",EducationLevels.PHD,philosophy,"{'city': 'Berkeley', 'state_or_province': 'Cal...",alt.atheism,"Teaches course philosophy , including metaphys...",berkeley,california,usa
2,Marcus Li,software developer,technology,Develops mobile applications with a focus on s...,EducationLevels.BACHELORS,computer science,"{'city': 'Seattle', 'state_or_province': 'Wash...",alt.atheism,Develops mobile application focus social netwo...,seattle,washington,usa
3,Emily Nguyen,human rights activist,ngo,Advocates for freedom of belief and expression...,EducationLevels.MASTERS,international relations,"{'city': 'New York', 'state_or_province': 'New...",alt.atheism,Advocates freedom belief expression around wor...,new york,new york,usa
4,Jordan Smith,blogger,digital media,"Runs a popular blog discussing religion, athei...",EducationLevels.BACHELORS,english,"{'city': 'Denver', 'state_or_province': 'Color...",alt.atheism,"Runs popular blog discussing religion , atheis...",denver,colorado,usa


## Fitting models to the data


Extract numerical and categorical features from the data.


In [310]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse as sp

# vectorization for job description
vectorizer = TfidfVectorizer()
X_job_description = vectorizer.fit_transform(df['job_description_preprocessed'])

# one-hot encoding for education
X_education = pd.get_dummies(df['education'])

X_occupation = CountVectorizer()

# bag of words features
bow_fields = ['occupation', 'industry', 'major', 'city', 'state_or_province', 'country']
bow_features = {}
for field in bow_fields:
    vectorizer = CountVectorizer()
    bow_features[field] = vectorizer.fit_transform(df[field])
    
# combine covariates
X_job_description_sparse = sp.csr_matrix(X_job_description)
X_categorical_sparse = sp.csr_matrix(X_education)
X_combined = sp.hstack([X_job_description_sparse, X_categorical_sparse] + [bow_features[field] for field in bow_fields])

# extract targets
y = df['news_group']

### Multiclass classification


In this section, we fit supervised machine learning models with multiclass outputs. The outputs correspond to news groups, and the probability that a given profile is relevant to each news group.


#### Logistic regression


In [311]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# logistic regression model
logistic_model = LogisticRegression(max_iter=1000, random_state=1)

# 6-fold cross validation
cv_logistic = cross_val_score(logistic_model, X_combined, y, cv=6)
print(f"Logistic regression average prediction score: {cv_logistic.mean():.2f}")

Logistic regression average prediction score: 0.29


#### Naive Bayes


In [312]:
from sklearn.naive_bayes import MultinomialNB

# bernoulli bayes model (binary classification)
nb = MultinomialNB()

# 6-fold cross validation
cv_nb = cross_val_score(nb, X_combined, y, cv=6)
print(f"Naive Bayes average prediction score: {cv_nb.mean():.2f}")

Naive Bayes average prediction score: 0.28


#### Decision tree


In [313]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=1)

# 6-fold cross validation
cv_dt = cross_val_score(dt, X_combined, y, cv=6)
print(f"Decision tree average prediction score: {cv_dt.mean():.2f}")

Decision tree average prediction score: 0.62


#### Support vector machine


In [314]:
from sklearn.svm import SVC

svm = SVC(random_state=1)

# 6-fold cross validation
cv_svm = cross_val_score(svm, X_combined, y, cv=6)
print(f"SVM average prediction score: {cv_svm.mean():.2f}")

SVM average prediction score: 0.28


### Binary classification


In this section, we fit supervised machine learning models with binary outputs. We train separate models for each news group. Each model tries to predict the relevance of a given profile to the model.


Define function to predict a profile's relevance to each news group.


In [315]:
def predict_profile_relevance(features, category_models):
    probs = {}
    for news_group, model in category_models.items():
        # get class probabilities from model
        prob = model.predict(features)
        probs[news_group] = prob
    return probs

Extract news group categories for the labels.


In [321]:
# label is 1 if profile belongs to the news group key, 0 otherwise
binary_labels = {category: (df['news_group'] == category).astype(int) for category in NEWSGROUPS}
binary_labels['alt.atheism']

0      1
1      1
2      1
3      1
4      1
      ..
176    0
177    0
178    0
179    0
180    0
Name: news_group, Length: 181, dtype: int32

#### Logistic regression


In [317]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

logistic_models_per_news_group = {}

for news_group, labels in binary_labels.items():
    
    # train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_combined, labels, test_size=0.2, random_state=1)
    
    # train model
    lr_model = LogisticRegression(max_iter=1000)
    lr_model.fit(X_train, y_train)
    
    # get predictions and accuracy
    y_pred = lr_model.predict(X_test)
    
    # get model accuracy
    print(f'Logistic regression F1 for {news_group}: {f1_score(y_test, y_pred):0.4f}')
    
    # store model
    logistic_models_per_news_group[news_group] = lr_model

Logistic regression F1 for alt.atheism: 0.0000
Logistic regression F1 for comp.windows.x: 0.0000
Logistic regression F1 for misc.forsale: 0.0000
Logistic regression F1 for rec.autos: 0.0000
Logistic regression F1 for sci.med: 0.7500
Logistic regression F1 for rec.sport.hockey: 0.0000
Logistic regression F1 for sci.space: 0.0000
Logistic regression F1 for soc.religion.christian: 0.0000
Logistic regression F1 for talk.politics.guns: 0.0000


#### Naive Bayes

In [318]:
from sklearn.naive_bayes import BernoulliNB

nb_models_per_news_group = {}

for news_group, labels in binary_labels.items():
    
    # train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_combined, labels, test_size=0.2, random_state=1)
    
    # train model
    nb_model = BernoulliNB()
    nb_model.fit(X_train, y_train)
    
    # get predictions and accuracy
    y_pred = nb_model.predict(X_test)
    
    # get model accuracy
    print(f'Naive Bayes F1 for {news_group}: {f1_score(y_test, y_pred):0.4f}')
    
    # store model
    nb_models_per_news_group[news_group] = nb_model

Naive Bayes F1 for alt.atheism: 0.0000
Naive Bayes F1 for comp.windows.x: 0.0000
Naive Bayes F1 for misc.forsale: 0.0000
Naive Bayes F1 for rec.autos: 0.0000
Naive Bayes F1 for sci.med: 0.0000
Naive Bayes F1 for rec.sport.hockey: 0.0000
Naive Bayes F1 for sci.space: 0.0000
Naive Bayes F1 for soc.religion.christian: 0.0000
Naive Bayes F1 for talk.politics.guns: 0.0000


#### Decision tree

In [319]:
decision_tree_models_per_news_group = {}

for news_group, labels in binary_labels.items():
    
    # train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_combined, labels, test_size=0.2, random_state=1)
    
    # train model
    dt_model = DecisionTreeClassifier(random_state=1)
    dt_model.fit(X_train, y_train)
    
    # get predictions and accuracy
    y_pred = dt_model.predict(X_test)
    
    # get model accuracy
    print(f'Decision tree F1 for {news_group}: {f1_score(y_test, y_pred):0.4f}')
    
    # store model
    decision_tree_models_per_news_group[news_group] = dt_model

Decision tree F1 for alt.atheism: 0.7692
Decision tree F1 for comp.windows.x: 0.0000
Decision tree F1 for misc.forsale: 0.2222
Decision tree F1 for rec.autos: 0.0000
Decision tree F1 for sci.med: 1.0000
Decision tree F1 for rec.sport.hockey: 1.0000
Decision tree F1 for sci.space: 0.4000
Decision tree F1 for soc.religion.christian: 1.0000
Decision tree F1 for talk.politics.guns: 0.5000


#### Support vector machine

In [320]:
svm_models_per_news_group = {}

for news_group, labels in binary_labels.items():
    
    # train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_combined, labels, test_size=0.2, random_state=1)
    
    # train model
    svm_model = SVC(random_state=1)
    svm_model.fit(X_train, y_train)
    
    # get predictions and accuracy
    y_pred = svm_model.predict(X_test)
    
    # get model accuracy
    print(f'SVM F1 for {news_group}: {f1_score(y_test, y_pred):0.4f}')
    
    # store model
    svm_models_per_news_group[news_group] = svm_model

SVM F1 for alt.atheism: 0.0000
SVM F1 for comp.windows.x: 0.0000
SVM F1 for misc.forsale: 0.0000
SVM F1 for rec.autos: 0.0000
SVM F1 for sci.med: 0.0000
SVM F1 for rec.sport.hockey: 0.0000
SVM F1 for sci.space: 0.0000
SVM F1 for soc.religion.christian: 0.0000
SVM F1 for talk.politics.guns: 0.0000
