In [1]:
# Sean Wendlandt 5/6/23
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, normalize
from scipy.stats import norm
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np 
import pandas as pd 
import re, string

#### 1A

In [2]:
income_df = pd.read_csv('income_evaluation_cat.csv',skipinitialspace=True)
income_df.head()

Unnamed: 0,workclass,education,race,gender,income
0,State-gov,Bachelors,White,Male,<=50K
1,Self-emp-not-inc,Bachelors,White,Male,<=50K
2,Private,HS-grad,White,Male,<=50K
3,Private,11th,Black,Male,<=50K
4,Private,Bachelors,Black,Female,<=50K


In [3]:
for col in income_df.columns:
    print(col,':',income_df[col].unique(),'\n')

workclass : ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked'] 

education : ['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th'] 

race : ['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other'] 

gender : ['Male' 'Female'] 

income : ['<=50K' '>50K'] 



#### 1B

In [4]:
X = ['Private','Bachelors','White','Female']

In [5]:
p_less50 = len(income_df[income_df['income']=='<=50K'])/len(income_df)
print(p_less50)
p_greater50 = len(income_df[income_df['income']=='>50K'])/len(income_df)
print(p_greater50)

0.7591904425539756
0.2408095574460244


In [9]:
def my_nb(in_df, test_case):
    p_less50 = len(in_df[in_df['income'] == '<=50K']) / len(in_df)
    p_greater50 = len(in_df[in_df['income'] == '>50K']) / len(in_df)

    p_work_less = len(in_df.loc[(in_df['workclass'] == test_case[0])].loc[in_df['income'] == '<=50K']) / (p_less50 * len(in_df))
    p_work_greater = len(in_df.loc[(in_df['workclass'] == test_case[0])].loc[in_df['income'] == '>50K']) / (p_greater50 * len(in_df))

    p_edu_less = len(in_df.loc[(in_df['education'] == test_case[1])].loc[in_df['income'] == '<=50K']) / (p_less50 * len(in_df))
    p_edu_greater = len(in_df.loc[(in_df['education'] == test_case[1])].loc[in_df['income'] == '>50K']) / (p_greater50 * len(in_df))

    p_race_less = len(in_df.loc[(in_df['race'] == test_case[2])].loc[in_df['income'] == '<=50K']) / (p_less50 * len(in_df))
    p_race_greater = len(in_df.loc[(in_df['race'] == test_case[2])].loc[in_df['income'] == '>50K']) / (p_greater50 * len(in_df))

    p_gen_less = len(in_df.loc[(in_df['gender'] == test_case[3])].loc[in_df['income'] == '<=50K']) / (p_less50 * len(in_df))
    p_gen_greater = len(in_df.loc[(in_df['gender'] == test_case[3])].loc[in_df['income'] == '>50K']) / (p_greater50 * len(in_df))

    p_dict = {}

    p_dict['<=50K'] = p_less50 * p_work_less * p_edu_less * p_race_less * p_gen_less
    p_dict['>50K'] = p_greater50 * p_work_greater * p_edu_greater * p_race_greater * p_gen_greater

    # Normalize probabilities
    total_prob = sum(p_dict.values())
    p_dict = {k: v / total_prob for k, v in p_dict.items()}

    print(p_dict)

    max_key = '<=50K'
    max_val = p_dict['<=50K']

    for key in p_dict.keys():
        if p_dict[key] > max_val:
            max_key = key
            max_val = p_dict[key]

    return max_key

In [10]:
my_nb(income_df,X)

{'<=50K': 0.7919784735489185, '>50K': 0.20802152645108152}


'<=50K'

#### 1C

In [8]:
income_scaled = pd.DataFrame(income_df,copy=True)
work_label = LabelEncoder()
edu_label = LabelEncoder()
race_label = LabelEncoder()
gender_label = LabelEncoder()
income_label = LabelEncoder()

income_scaled['workclass'] = work_label.fit_transform(income_scaled['workclass'])
income_scaled['education'] = edu_label.fit_transform(income_scaled['education'])
income_scaled['race'] = race_label.fit_transform(income_scaled['race'])
income_scaled['gender'] = gender_label.fit_transform(income_scaled['gender'])
income_scaled['income'] = income_label.fit_transform(income_scaled['income'])

income_scaled

Unnamed: 0,workclass,education,race,gender,income
0,7,9,4,1,0
1,6,9,4,1,0
2,4,11,4,1,0
3,4,1,2,1,0
4,4,9,2,0,0
...,...,...,...,...,...
32556,4,7,4,0,0
32557,4,11,4,1,1
32558,4,11,4,0,0
32559,4,11,4,1,0


#### 1D

In [10]:
X = income_scaled.iloc[:,:-1]
y = income_scaled.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3)
print(X_test.head(1))
print(y_test.head(2))

       workclass  education  race  gender
20689          4         15     4       1
20689    0
32174    0
Name: income, dtype: int32


#### 1E

In [12]:
cnb = CategoricalNB()

cnb.fit(X_train,y_train)
print('Train:',cnb.score(X_train,y_train))
print('Test:',cnb.score(X_test,y_test))

Train: 0.7854510354510355
Test: 0.7814515303511107


#### 2A

In [13]:
income_cont = pd.read_csv('income_evaluation_continuous.csv',skipinitialspace=True)
income_cont

Unnamed: 0,age,education_num,hours_per_week,income
0,39,13,40,<=50K
1,50,13,13,<=50K
2,38,9,40,<=50K
3,53,7,40,<=50K
4,28,13,40,<=50K
...,...,...,...,...
32556,27,12,38,<=50K
32557,40,9,40,>50K
32558,58,9,40,<=50K
32559,22,9,20,<=50K


In [15]:
df_less50 = income_cont[income_cont["income"] == '<=50K']
df_greater50 = income_cont[income_cont["income"] == '>50K']
print('df_less50 \n',df_less50.apply(func = ["mean","std"]))
print('df_greater50 \n',df_greater50.apply(func = ["mean","std"]))

df_less50 
             age  education_num  hours_per_week
mean  36.783738       9.595065       38.840210
std   14.020088       2.436147       12.318995
df_greater50 
             age  education_num  hours_per_week
mean  44.249841      11.611657       45.473026
std   10.519028       2.385129       11.012971


  print('df_less50 \n',df_less50.apply(func = ["mean","std"]))
  print('df_greater50 \n',df_greater50.apply(func = ["mean","std"]))


#### 2F

In [16]:
income_counts = income_cont.income.value_counts()
    
p_incomes = income_counts/sum(income_counts.values)
    
p_less = p_incomes[0]
print(p_less)
p_greater = p_incomes[1]
print(p_greater)

0.7591904425539756
0.2408095574460244


In [17]:
X = [30, 10, 45]

def n_bayes_cont(training, test, outcome_col):
    
    post_probability = {}
    
    for outcome in training[outcome_col].unique():
        p_yi = training[outcome_col].value_counts()[outcome]/training.shape[0]
        p_x_yi = 1
        
        # p_x_yi calculations
        training_yi = training[training[outcome_col] == outcome]
        for i,x in enumerate(test):
            mean = np.mean(training_yi.iloc[:,i])
            std = np.std(training_yi.iloc[:,i])
            partial_prob = norm.pdf(x, mean, std)
            p_x_yi = p_x_yi*partial_prob
      
        p_yi_x = p_x_yi*p_yi
        post_probability[outcome]=p_yi_x
    # Normalize probabilities and print
    total_prob = sum(post_probability.values())
    for outcome, prob in post_probability.items():
        print("{}: {}".format(outcome, prob/total_prob))

    pred = max(post_probability, key=post_probability.get)
    return pred

n_bayes_cont(income_cont, X, "income")

<=50K: 0.8346035034405684
>50K: 0.16539649655943162


'<=50K'

#### 2B

In [19]:
X = income_cont.iloc[:,:-1]
y = income_cont.iloc[:,-1]

print(X.head())
sc = StandardScaler()
X_std = sc.fit_transform(X)
print('\nStandardized:',X_std[:5])

X_norm = normalize(X)
print('\nNormalized:',X_norm[:5])

   age  education_num  hours_per_week
0   39             13              40
1   50             13              13
2   38              9              40
3   53              7              40
4   28             13              40

Standardized: [[ 0.03067056  1.13473876 -0.03542945]
 [ 0.83710898  1.13473876 -2.22215312]
 [-0.04264203 -0.42005962 -0.03542945]
 [ 1.05704673 -1.19745882 -0.03542945]
 [-0.77576787  1.13473876 -0.03542945]]

Normalized: [[0.67993384 0.22664461 0.69736804]
 [0.93856382 0.24402659 0.24402659]
 [0.67976467 0.16099689 0.71554175]
 [0.79379039 0.10484024 0.59908709]
 [0.55415675 0.25728706 0.79165249]]


#### 2C

In [20]:
X_train_std,X_test_std,y_train_std,y_test_std =train_test_split(X_std,y,test_size= .25)

In [21]:
X_train_norm,X_test_norm,y_train_norm,y_test_norm = train_test_split(X_norm,y,test_size= .25)

#### 2D

In [22]:
nb_std = GaussianNB()
nb_norm = GaussianNB()

nb_std.fit(X_train_std,y_train_std)
print('NB with Standard Scaler:')
print('Train:',nb_std.score(X_train_std,y_train_std))
print('Test:',nb_std.score(X_test_std,y_test_std))
print()

nb_norm.fit(X_train_norm,y_train_norm)
print('NB with normalized data:')
print('Train:',nb_norm.score(X_train_norm,y_train_norm))
print('Test:',nb_norm.score(X_test_norm,y_test_norm))

NB with Standard Scaler:
Train: 0.7988533988533989
Test: 0.8018670924947795

NB with normalized data:
Train: 0.7573710073710074
Test: 0.7646480776317406


#### 3A

In [23]:
true_df = pd.read_csv('True.csv',skipinitialspace=True)
true_df = pd.DataFrame({'title':true_df.title,'text':true_df.text,'news_type':True})
true_df.head(5)

Unnamed: 0,title,text,news_type
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,True


#### 3B

In [24]:
fake_df = pd.read_csv('Fake.csv',skipinitialspace=True)
fake_df = pd.DataFrame({'title':fake_df.title,'text':fake_df.text,'news_type':False})
fake_df.head(5)

Unnamed: 0,title,text,news_type
0,Donald Trump Sends Out Embarrassing New Year’s...,Donald Trump just couldn t wish all Americans ...,False
1,Drunk Bragging Trump Staffer Started Russian C...,House Intelligence Committee Chairman Devin Nu...,False
2,Sheriff David Clarke Becomes An Internet Joke ...,"On Friday, it was revealed that former Milwauk...",False
3,Trump Is So Obsessed He Even Has Obama’s Name ...,"On Christmas day, Donald Trump announced that ...",False
4,Pope Francis Just Called Out Donald Trump Duri...,Pope Francis used his annual Christmas Day mes...,False


#### 3C

In [28]:
news_df = pd.concat([true_df, fake_df], ignore_index=True)
news_df['news'] = news_df['title'].str.cat(news_df['text'], sep=' ')
news_df.drop(['title', 'text'], axis=1, inplace=True)
news_df.head()

Unnamed: 0,news_type,news
0,True,"As U.S. budget fight looms, Republicans flip t..."
1,True,U.S. military to accept transgender recruits o...
2,True,Senior U.S. Republican senator: 'Let Mr. Muell...
3,True,FBI Russia probe helped by Australian diplomat...
4,True,Trump wants Postal Service to charge 'much mor...


#### 3D

In [35]:
sw = set(stopwords.words('english'))
def clean(text, stopwords):
    text = re.sub(r'<[^<>]*>',' ',text)
    text_list = text.split()
    text_words = list()
    
    punctuation = set(string.punctuation)
    
    for word in text_list:
        while len(word) > 0 and word[0] in punctuation:
            word = word[1:]
        while len(word) > 0 and word[-1] in punctuation:
            word = word[:-1]
        if len(word) > 0 and '/' not in word:
            if word.lower() not in stopwords:
                text_words.append(word.lower())
        clean_text = ' '.join(text_words)
        
    return clean_text

In [36]:
news_df.news[302]

'Senator Franken, facing resignation calls, to make announcement on Thursday: office (Reuters) - U.S Democratic Senator Al Franken will make an announcement on Thursday, his office said, after several Democratic senators called for him to step down in light of allegations of sexual misconduct against him.  His office offered no further details in a brief statement on Wednesday. '

In [37]:
clean(news_df.news[302],sw)

'senator franken facing resignation calls make announcement thursday office reuters u.s democratic senator al franken make announcement thursday office said several democratic senators called step light allegations sexual misconduct office offered details brief statement wednesday'

In [38]:
news_df.dropna(inplace=True)
news_df.news = news_df.news.apply(clean,stopwords=sw)
mask = news_df.news.str.len() > 50
news_df = news_df[mask]
news_df.head()

Unnamed: 0,news_type,news
0,True,u.s budget fight looms republicans flip fiscal...
1,True,u.s military accept transgender recruits monda...
2,True,senior u.s republican senator let mr mueller j...
3,True,fbi russia probe helped australian diplomat ti...
4,True,trump wants postal service charge much amazon ...


#### 3E

In [39]:
tfidf = TfidfVectorizer(ngram_range=(1,2),stop_words='english',min_df=10,max_features=None)

X = news_df.news
y=news_df["news_type"]

#### 3F

In [40]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.3)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

#### 3G

In [55]:
model = MultinomialNB()
model.fit(X_train_tfidf,y_train)
MultinomialNB()
print('Train:',model.score(X_train_tfidf,y_train))
print('Test:',model.score(X_test_tfidf,y_test))

Train: 0.9558030669895077
Test: 0.9508097928436912


#### 3H

In [59]:
scores = cross_val_score(estimator=MultinomialNB(),X=X_train_tfidf,y=y_train,cv=5)
print('Avg cross val scores:',scores.mean())
print('Std cross val scores',scores.std())

Avg cross val scores: 0.9479257465698143
Std cross val scores 0.0018050247939944391


#### 3I

In [41]:
MultinomialNB().get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

In [42]:
pipe = Pipeline([('tfidf',TfidfVectorizer(stop_words='english')),('nb',MultinomialNB())])
param_grid = [{'tfidf__min_df':[6,11,21],'tfidf__ngram_range':[(1,1),(1,2)],'tfidf__norm':['l1','l2'],
              'nb__alpha':[0.6,1.1]}]
grid = GridSearchCV(estimator=pipe,param_grid=param_grid,cv=5)
grid.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf',
                                        TfidfVectorizer(stop_words='english')),
                                       ('nb', MultinomialNB())]),
             param_grid=[{'nb__alpha': [0.6, 1.1], 'tfidf__min_df': [6, 11, 21],
                          'tfidf__ngram_range': [(1, 1), (1, 2)],
                          'tfidf__norm': ['l1', 'l2']}])

In [44]:
grid.best_params_

{'nb__alpha': 0.6,
 'tfidf__min_df': 6,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2'}

In [45]:
grid.score(X_train,y_train)

0.964681194511703