<a href="https://colab.research.google.com/github/SoheilBadri2000/DataScience2/blob/main/03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [51]:
import  seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import time
import random

# Naive Bayes: Titanic

In [2]:
# Step 1: load the dataset
df = sns.load_dataset("titanic")

In [3]:
# Step 2:EDA
df.shape

(891, 15)

In [4]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [6]:
df[["survived", "pclass"]]

Unnamed: 0,survived,pclass
0,0,3
1,1,1
2,1,3
3,1,1
4,0,3
...,...,...
886,0,2
887,1,1
888,0,3
889,1,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [8]:
# Step 3: Preprocessing
# 3.1 null handling
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [9]:
df["age"] = df["age"].fillna(df["age"].mean())

In [10]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [11]:
df["embarked"] = df["embarked"].fillna(df["embarked"].mode()[0])

In [12]:
df["embark_town"] = df["embark_town"].fillna(df["embark_town"].mode()[0])

In [13]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
deck           688
embark_town      0
alive            0
alone            0
dtype: int64

In [14]:
# 3.2. drop useless columns
df = df.drop(["alive", "deck", "adult_male", "who", "class", "embarked"], axis=1)

In [15]:
df.shape

(891, 9)

In [16]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
0,0,3,male,22.0,1,0,7.25,Southampton,False
1,1,1,female,38.0,1,0,71.2833,Cherbourg,False
2,1,3,female,26.0,0,0,7.925,Southampton,True
3,1,1,female,35.0,1,0,53.1,Southampton,False
4,0,3,male,35.0,0,0,8.05,Southampton,True


In [17]:
# Categorical ---> Numerical
df["embark_town"].nunique()

3

In [18]:
df["embark_town"].unique()

array(['Southampton', 'Cherbourg', 'Queenstown'], dtype=object)

In [19]:
# Method 1
df["sex"].map({"male": 0, "female": 1})
df["embark_town"].map({"Southampton": 0, "Cherbourg": 1, "Queenstown": 2})

0      0
1      1
2      0
3      0
4      0
      ..
886    0
887    0
888    0
889    1
890    2
Name: embark_town, Length: 891, dtype: int64

In [20]:
df = pd.get_dummies(df)

In [21]:
df.shape

(891, 12)

In [22]:
df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.25,False,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,False,1,0,1,0,0
2,1,3,26.0,0,0,7.925,True,1,0,0,0,1
3,1,1,35.0,1,0,53.1,False,1,0,0,0,1
4,0,3,35.0,0,0,8.05,True,0,1,0,0,1


In [23]:
# drop the redundant columns after get_dummies
df = df.drop(["sex_female", "embark_town_Cherbourg"], axis="columns") # or axis=1

In [24]:
# Data and label separation
label = df.survived
data = df.drop(["survived"], axis=1)

In [25]:
data.shape

(891, 9)

In [26]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.25, random_state=42)

In [27]:
# train phase
param_grid = {"var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]}
gnb = GaussianNB()
clf = GridSearchCV(gnb, param_grid, cv=5)
clf.fit(X_train, y_train)

In [28]:
print(clf.best_params_)

{'var_smoothing': 1e-05}


In [29]:
# The model evaluation phase
print(clf.score(X_test, y_test))

0.7937219730941704


In [30]:
lrc = LogisticRegression()
lrc.fit(X_train, y_train)
print(lrc.score(X_test, y_test))

0.7982062780269058


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
knnc = KNeighborsClassifier()
knnc.fit(X_train, y_train)
print(knnc.score(X_test, y_test))

0.7040358744394619


# Some Importent Concepts In Python

In [32]:
# prerequisite (enumerate, lambda, list comprehension)
# enumerate
for i in range(10):
  print(i)


0
1
2
3
4
5
6
7
8
9


In [33]:
mylist = [1, 2, 4, 8]
for i in mylist:
  print(i)

1
2
4
8


In [34]:
for i in range(len(mylist)):
  print(i)

0
1
2
3


In [35]:
for i in range(len(mylist)):
  print(i, ":", mylist[i])

0 : 1
1 : 2
2 : 4
3 : 8


In [36]:
# enumerate is an alternative
for idx, val in enumerate(mylist): # enumerate = range + len + slicing
  print(idx, ":", val)

0 : 1
1 : 2
2 : 4
3 : 8


In [37]:
# 2. lambda function
def square_if_even(x):
  if x % 2 == 0:
    return x**2
  else:
    return x

In [38]:
square_if_even(5)

5

In [39]:
square_if_even(4)

16

In [40]:
square = lambda x, y: (x+y)**2
square(5, 2)

49

In [41]:
even_square = lambda x: x**2 if x%2==0 else x
print(even_square(5))
print(even_square(4))

5
16


In [42]:
# 3. List comprehension
mystering = []
for i in range(10):
  mystering.append(i)

print(mystering)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [43]:
mylistcom = [i for i in range(10)]
print(mylistcom)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


# Naive Bayes from Scratch

In [44]:
# camel case: gaussianNaiveBayes
# capitalize: GaussianNaiveBayes ---> class
# underline: gaussian_naive_bayes ---> function

class GaussianNaiveBayes:
  # fit
  def fit(self, X, y):
    n_sample, n_feature = X.shape
    self._classes = np.unique(y)
    n_classes = len(self._classes)
    self._mean = np.zeros((n_classes, n_feature), dtype=np.float32)
    self._var = np.zeros((n_classes, n_feature), dtype=np.float32)
    self._prior = np.zeros(n_classes, dtype=np.float32)

  # calculating mean, variance, and prior
    for i, c in enumerate(self._classes):
      X_for_class_c = X[y==c]
      self._mean[i, :] = X_for_class_c.mean(axis=0)
      self._var[i, :] = X_for_class_c.var(axis=0)
      self._prior[i] = X_for_class_c.shape[0] / float(n_sample)

  # calculating likelihood
  def likelihood(self, class_idx, x):
    mean = self._mean[class_idx]
    var = self._var[class_idx]
    num = np.exp(-(-x-mean)**2 / 2*var) # numerator
    denom = np.sqrt(2 * np.pi * var) # denominator
    return num / denom

  # prediction method
  def predict(self, X):
    y_pred = [self._classify_sample(x) for x in X]
    return np.array(y_pred)

  # classification phase
  def _classify_sample(self, x):
    posteriors = []

    for i, c in enumerate(self._classes):
      pri = np.log(self._prior[i])
      post = np.sum(np.log(self.likelihood(i, x)))
      posterior = pri + post
      posteriors.append(posterior)

    return self._classes[np.argmax(posteriors)]

In [45]:
# load (synthesize data)
X, y = make_classification(n_samples=100000, n_features=20, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

In [46]:
start_time = time.perf_counter()
gnb = GaussianNaiveBayes()
# train phase
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

end_time = time.perf_counter()
print(f"duration of the manual model was: {end_time - start_time}")
print(f"acc for manual model was: {accuracy_score(y_test, y_pred)}")

duration of the manual model was: 2.9542453800000033
acc for manual model was: 0.13984


In [47]:
start_time = time.perf_counter()
sk_gnb = GaussianNB()
# train phase
sk_gnb.fit(X_train, y_train)
y_pred = sk_gnb.predict(X_test)
accuracy_score(y_test, y_pred)

end_time = time.perf_counter()
print(f"duration of the library model was: {end_time - start_time}")
print(f"acc for manual library was: {accuracy_score(y_test, y_pred)}")

duration of the library model was: 0.05763475300000209
acc for manual library was: 0.85908


# Spam Detection

In [53]:
# load data
!gdown --id 1Qg3M7ZfZbt7OByIply-M99Po5M5VkjQH

Downloading...
From: https://drive.google.com/uc?id=1Qg3M7ZfZbt7OByIply-M99Po5M5VkjQH
To: /content/Spam.csv
100% 486k/486k [00:00<00:00, 5.16MB/s]


In [54]:
df = pd.read_csv("Spam.csv")

In [55]:
df.tail()

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [56]:
label = df["Category"].map({"ham": 0, "spam": 1})
data = df["Message"]

In [57]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.3, random_state=42)

In [58]:
cvec = CountVectorizer()
X_train_count = cvec.fit_transform(X_train.values)

In [59]:
mnb = MultinomialNB()
mnb.fit(X_train_count, y_train)

In [62]:
X_test_count = cvec.transform(X_test)

In [63]:
mnb.score(X_test_count, y_test)

0.9904306220095693

In [64]:
emails = ['Hey Reza! can you join me to watch the football match tomorrow at 10',
          'Upto 20% discount to have the final product just for 20$',
          'Yesterday, we had a promotion for our goods with 34% discount. You can have the XYZ product!',
          'This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize.']

emails_count = cvec.transform(emails)
mnb.predict(emails_count)

array([0, 1, 0, 1])