# Supervised Learning Techniques

## Dependencies and classes

In [131]:
from pydantic import BaseModel, Field
from typing import Optional, List
import enum
import json

class EducationLevels(str, enum.Enum):
    HIGH_SCHOOL = "high_school"
    BACHELORS = "bachelors"
    MASTERS = "masters"
    PHD = "phd"
    NONE = "none"

class Location(BaseModel):
    city: str
    state_or_province: str
    country: str

class FakeProfile(BaseModel):
    name: str
    occupation: str
    industry: str
    job_description: str
    education: EducationLevels
    major: Optional[str] = Field(default=None)
    location: Location

    @classmethod
    def from_json(cls, data: str):
        return cls(**json.loads(data))

class FakeProfiles(BaseModel):
    profiles: List[FakeProfile]

    @classmethod
    def from_json(cls, data: str):
        return cls(**json.loads(data))

In [132]:
NEWSGROUPS = [
    'alt.atheism',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'sci.med',
    'rec.sport.hockey',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns'
]

## Loading the data

Load profiles data and store in a dataframe with their true labels.

In [133]:
import pandas as pd

profiles = []

# Load the profiles from the json files
for news_group in NEWSGROUPS:
    
    with open(f"../fake_profiles/{news_group.replace('.', '_')}.json", "r", encoding='utf-8') as f:
        
        # get profiles for current news group
        profiles_group = FakeProfiles.from_json(f.read())
        
        # store all profiles in dataframe
        for profile in profiles_group.profiles:
            # convert profile to dict
            profile_dict = profile.model_dump()
            # add news group field (true label for profile)
            profile_dict['news_group'] = news_group
            profiles.append(profile_dict)
            
        
df = pd.DataFrame(profiles)
df.head()

Unnamed: 0,name,occupation,industry,job_description,education,major,location,news_group
0,Thomas Reed,Journalist,Media,"Writes articles on various social topics, incl...",EducationLevels.BACHELORS,Journalism,"{'city': 'Austin', 'state_or_province': 'Texas...",alt.atheism
1,Samantha Brooks,College Professor,Education,"Teaches courses on philosophy, including metap...",EducationLevels.PHD,Philosophy,"{'city': 'Berkeley', 'state_or_province': 'Cal...",alt.atheism
2,Marcus Li,Software Developer,Technology,Develops mobile applications with a focus on s...,EducationLevels.BACHELORS,Computer Science,"{'city': 'Seattle', 'state_or_province': 'Wash...",alt.atheism
3,Emily Nguyen,Human Rights Activist,NGO,Advocates for freedom of belief and expression...,EducationLevels.MASTERS,International Relations,"{'city': 'New York', 'state_or_province': 'New...",alt.atheism
4,Jordan Smith,Blogger,Digital Media,"Runs a popular blog discussing religion, athei...",EducationLevels.BACHELORS,English,"{'city': 'Denver', 'state_or_province': 'Color...",alt.atheism


## Preparing the data for model fitting

### Preprocessing text

Preprocessing text fields for each profile. Techniques include stopword removal, lemmatization, and vectorization.

In [134]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text_field(text):
    # remove blanks and convert to lower case
    if text is None:
        return 'none'
    else:
        return text.lower().strip()

# more preprocessing for job description
def preprocess_job_description(text):
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    lemmatized_text = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sevag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sevag\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocess fields and add back to the dataframe.

In [135]:
# simple preprocessing for occupation, industry, major
text_fields = ['occupation', 'industry', 'major']
for field in text_fields:
    df[field] = df[field].apply(preprocess_text_field)
    
# vectorization and preprocessing for job description
df['job_description_preprocessed'] = df['job_description'].apply(preprocess_job_description)
    
# flatten location
df['city'] = df['location'].apply(lambda x: preprocess_text_field(x['city']))
df['state_or_province'] = df['location'].apply(lambda x: preprocess_text_field(x['state_or_province']))
df['country'] = df['location'].apply(lambda x: preprocess_text_field(x['country']))

df.head()

Unnamed: 0,name,occupation,industry,job_description,education,major,location,news_group,job_description_preprocessed,city,state_or_province,country
0,Thomas Reed,journalist,media,"Writes articles on various social topics, incl...",EducationLevels.BACHELORS,journalism,"{'city': 'Austin', 'state_or_province': 'Texas...",alt.atheism,"Writes article various social topic , includin...",austin,texas,usa
1,Samantha Brooks,college professor,education,"Teaches courses on philosophy, including metap...",EducationLevels.PHD,philosophy,"{'city': 'Berkeley', 'state_or_province': 'Cal...",alt.atheism,"Teaches course philosophy , including metaphys...",berkeley,california,usa
2,Marcus Li,software developer,technology,Develops mobile applications with a focus on s...,EducationLevels.BACHELORS,computer science,"{'city': 'Seattle', 'state_or_province': 'Wash...",alt.atheism,Develops mobile application focus social netwo...,seattle,washington,usa
3,Emily Nguyen,human rights activist,ngo,Advocates for freedom of belief and expression...,EducationLevels.MASTERS,international relations,"{'city': 'New York', 'state_or_province': 'New...",alt.atheism,Advocates freedom belief expression around wor...,new york,new york,usa
4,Jordan Smith,blogger,digital media,"Runs a popular blog discussing religion, athei...",EducationLevels.BACHELORS,english,"{'city': 'Denver', 'state_or_province': 'Color...",alt.atheism,"Runs popular blog discussing religion , atheis...",denver,colorado,usa


## Fitting models to the data

Extract numerical and categorical features from the data.

In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# vectorization for job description
vectorizer = TfidfVectorizer()
X_job_description = vectorizer.fit_transform(df['job_description_preprocessed'])

# one-hot encoding for education
X_education = pd.get_dummies(df['education'])

X_occupation = CountVectorizer()

# bag of words features
bow_fields = ['occupation', 'industry', 'major', 'city', 'state_or_province', 'country']
bow_features = {}
for field in bow_fields:
    vectorizer = CountVectorizer()
    bow_features[field] = vectorizer.fit_transform(df[field])

Model training.

In [138]:
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# combine covariates
X_job_description_sparse = sp.csr_matrix(X_job_description)
X_categorical_sparse = sp.csr_matrix(X_education)
X_combined = sp.hstack([X_job_description_sparse, X_categorical_sparse] + [bow_features[field] for field in bow_fields])

# extract targets
y = df['news_group']

# # train-test split
# X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=1)

# # logistic regression
logistic_model = LogisticRegression(max_iter=1000)

# 6-fold cross validation
cv = cross_val_score(logistic_model, X_combined, y, cv=6)
print(f"Average prediction score: {cv.mean():.2f}")
print(f"Standard deviation: {cv.std():.2f}")

# logistic_model.fit(X_train, y_train)

# # obtain predictions and evaluate
# y_pred = logistic_model.predict(X_test)
# print(classification_report(y_test, y_pred))

Average prediction score: 0.29
Standard deviation: 0.06
