# Gradient Boosting - XGBoost PYTHON

In [None]:
!pip install xgboost scikit-learn



In [None]:
from xgboost import XGBRegressor
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error

In [None]:
xgb_reg = XGBRegressor(tree_method="hist", eval_metric=mean_squared_error, device="cuda")
X, y = load_diabetes(return_X_y=True)

In [None]:
xgb_reg.fit(X[:420], y[:420])



In [None]:
res = xgb_reg.predict(X[420:])
print(res, "\n\n", y[420:])

[137.00818  207.235    145.76045  177.4831   210.4277    58.489643
 207.03351  123.44159  261.89267   75.91403  108.41253  102.28192
 242.0201    56.245483  99.031906 130.29427   60.432182 217.11139
 114.64255   95.105896 130.35219   98.3319  ] 

 [146. 212. 233.  91. 111. 152. 120.  67. 310.  94. 183.  66. 173.  72.
  49.  64.  48. 178. 104. 132. 220.  57.]


# Random Forest - Python

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [None]:
X, y = make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=True)
X_train, X_test, y_train, y_test = X[:990], X[990:], y[:990], y[990:]
randforest_class = RandomForestClassifier(max_depth=4, random_state=0)

In [None]:
randforest_class.fit(X_train, y_train)

In [None]:
result = randforest_class.predict(X_test)
print(result, "\n\n", y_test, "\n\n", result==y_test)
print(X)

[1 0 0 1 0 0 0 0 0 1] 

 [1 0 0 1 0 0 0 0 0 1] 

 [ True  True  True  True  True  True  True  True  True  True]
[[-0.78478473 -0.1701223   1.75465033  1.90215562]
 [ 0.53551057 -0.39956886  0.99914735 -0.78605743]
 [ 0.40713033 -0.62855758 -1.14197053 -1.12429904]
 ...
 [-0.16545926  1.19324114  0.79670618 -0.93083317]
 [ 0.14410649  0.23345808 -1.88792247 -1.2825699 ]
 [ 0.67970944 -0.3858315   0.62670574  0.59527126]]


# IMDB classification

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!unzip "/content/drive/MyDrive/IMDB Dataset.csv.zip"

Archive:  /content/drive/MyDrive/IMDB Dataset.csv.zip
  inflating: IMDB Dataset.csv        


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download("punkt")
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
data = pd.read_csv("IMDB Dataset.csv")
data = data[:7000]
data['sentiment'].replace('positive', 1, inplace=True)
data['sentiment'].replace('negative', 0, inplace=True)
data['review'] = data['review'].str.lower()
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. <br /><br />the...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


In [None]:
def clean_data(text):
  text = re.sub(r"[^A-Za-z0-9\s]", "", text)
  text = re.sub(r"http\S+", "", text)
  text = re.sub(r"<.*?>", "", text)
  return text
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. <br /><br />the...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


In [None]:
def remove_stopwords(text):
  stop_words = stopwords.words('english')
  ttext = word_tokenize(text)
  for word in ttext:
    if word in stop_words:
      text = text.replace(word, '')
  return text

In [None]:
def remove_stemming(text):
  stemmer = SnowballStemmer('english')
  tokens = word_tokenize(text)
  res = ""
  for token in tokens:
    res += " " + stemmer.stem(token)
  return res[1:]
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. <br /><br />the...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


In [None]:
data['review'] = data['review'].apply(clean_data)
data['review'] = data['review'].apply(remove_stopwords)
data['review'] = data['review'].apply(remove_stemming)
data.head()

Unnamed: 0,review,sentiment
0,e revew nted wtchg 1 oz epod ll hook y rght ex...,1
1,wder ltle producti br br film techniqu unssum ...,1
2,thought ths wder wy spend tme o hot summer wee...,1
3,bsclli res fmli lttle boy jke thks res zomb cl...,0
4,petter mtte love time mey vulli stunng film wt...,1


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['review']).toarray()
Y = data['sentiment']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
shape = X_train.shape
y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

train_set = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
test_set = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())

train_loader = DataLoader(train_set, shuffle=True, batch_size=64)
test_loader = DataLoader(test_set, shuffle=True, batch_size=64)

In [None]:
import torch.nn as nn
import torch.optim as optim

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class LSTM(nn.Module):
  def __init__(self, input_s, hidden_s, layers_n, output_s):
    super().__init__()
    self.layers_n = layers_n
    self.hidden_s = hidden_s

    self.lstm = nn.LSTM(input_s, hidden_s, layers_n, batch_first=True)

    self.fc = nn.Linear(hidden_s, output_s)
  def forward(self, x):
    h0 = torch.zeros(self.layers_n, x.size(0), self.hidden_s).to(device).requires_grad_()
    c0 = torch.zeros(self.layers_n, x.size(0), self.hidden_s).to(device).requires_grad_()

    out, _ = self.lstm(x, (h0.detach(), c0.detach()))

    out = self.fc(out[:, -1, :])
    return out

In [None]:
epochs = 7
layers_n = 2
batch_size = 128
output_s = 1
input_s = shape[1]
hidden_s = 128
lr = 0.001

In [None]:
model = LSTM(input_s, hidden_s, layers_n, output_s)
bce_loss = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
for epoch in range(epochs):
  model.train()
  loss = None
  for X_b, y_b in train_loader:
    X_b, y_b = X_b.to(device), y_b.to(device)

    X_b = X_b.unsqueeze(1)
    outputs = model(X_b)
    outputs = torch.sigmoid(outputs.squeeze())

    loss = bce_loss(outputs, y_b)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  print(f"epoch: {epoch} - loss: {loss.item()}")

epoch: 0 - loss: 0.46297258138656616
epoch: 1 - loss: 0.09917920082807541
epoch: 2 - loss: 0.023022837936878204
epoch: 3 - loss: 0.0014312093844637275
epoch: 4 - loss: 0.000816239626146853
epoch: 5 - loss: 0.0003276093630120158
epoch: 6 - loss: 0.00023197894915938377


In [None]:
model.eval()
with torch.no_grad():
  correct = 0
  total = 0
  for X_b, y_b in test_loader:
    X_b, y_b = X_b.to(device), y_b.to(device)

    X_b = X_b.unsqueeze(1)

    outputs = model(X_b)
    predicted = (torch.sigmoid(outputs.squeeze()) > 0.5).float()

    total += y_b.size(0)
    correct += (predicted == y_b).sum().item()
  acc = correct/total
  print(acc*100)

85.28571428571429




# California housing



In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import xgboost as xgb

In [None]:
df = pd.read_csv("/content/drive/MyDrive/housing.csv")
df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
scaler_pt = PowerTransformer()
part_2 = ["longitude", "latitude", "housing_median_age", "total_rooms", "total_bedrooms", "population", "households", "median_income", "median_house_value"]
df[part_2] = scaler_pt.fit_transform(df[part_2])
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-2.414735e-15,1.089037,0.975968,-1.158227,-1.572948,-1.619967,-1.521093,1.903289,1.718078,NEAR BAY
1,-2.414735e-15,1.082404,-0.568893,1.896146,1.436871,1.090355,1.633844,1.897081,1.262941,NEAR BAY
2,-2.414735e-15,1.079079,1.76101,-0.534616,-1.130536,-1.139853,-1.136367,1.604361,1.228314,NEAR BAY
3,-2.414735e-15,1.079079,1.76101,-0.713802,-0.870963,-1.00027,-0.878626,1.051321,1.168611,NEAR BAY
4,-2.414735e-15,1.079079,1.76101,-0.399535,-0.647624,-0.985268,-0.665985,0.205878,1.173649,NEAR BAY


In [None]:
X, y = df[["longitude", "latitude", "housing_median_age", "total_rooms", "population", "households", "median_income"]], df["median_house_value"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [None]:
model = xgb.XGBRegressor(max_depth=7, device="cuda")
model.fit(X_train, y_train)



In [None]:
y_pred = model.predict(X_test)
r2 = r2_score(y_pred, y_test)
print(r2)

0.8376314392256548


# Churn Model | Python

In [43]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/Churn_Modelling.csv")
data = df.drop(["RowNumber", "CustomerId", "Surname"], axis=1)

In [None]:
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
def gender2bin(sex):
  return -1 if "Female" else 1

def geo2int(geo):
  country_int = {"Spain": 0, "France": 1, "Germany": 2}
  return country_int[geo]

In [None]:
scaler = PowerTransformer()
indexes = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary"]
data[indexes] = scaler.fit_transform(data[indexes])
data["Gender"] = data["Gender"].apply(gender2bin)
data["Geography"] = data["Geography"].apply(geo2int)
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,-0.33545,1,-1,0.451286,-1.027104,-1.324054,-0.968357,1,1,0.11875,1
1,-0.448602,0,-1,0.359732,-1.447834,0.632125,-0.968357,0,1,0.298155,0
2,-1.524793,1,-1,0.451286,1.016561,0.870229,1.820493,1,0,0.320049,1
3,0.495043,1,-1,0.168117,-1.447834,-1.324054,0.936715,0,0,-0.005022,0
4,2.095417,0,-1,0.5402,-1.027104,0.7787,-0.968357,1,1,-0.256295,0


In [None]:
X, y = data[["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary", "Geography", "Gender", "HasCrCard", "IsActiveMember", "EstimatedSalary"]], data["Exited"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(np.array(X), np.array(y), train_size=0.8)

In [None]:
model = RandomForestClassifier(max_depth=8, random_state=0)
model.fit(X_train, y_train)

In [None]:
res = model.predict(X_test)
acc = accuracy_score(y_test, res)
print(f"Model's accuracy: {acc}")

Model's accuracy: 0.865


# Alzheimer MRI classification model

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# importing
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import cv2
import PIL
from PIL import Image
from torchvision import datasets, models, transforms
from torchvision.transforms import v2
import tqdm

print(torch.cuda.is_available())

False


In [None]:
category_classes = { # adding names for classes
    0: "Mild Demented",
    1: "Moderate Demented",
    2: "Non Demented",
    3: "Very Mild Demented",
}

path_to_data = "/content/drive/MyDrive/Alzheimer MRI Disease Classification Dataset/Data" # main path to the directory with dataset

In [None]:
# loading training data
df = pd.read_parquet(f"{path_to_data}/train-00000-of-00001-c08a401c53fe5312.parquet", engine="pyarrow")
df.head()

Unnamed: 0,image,label
0,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,2
1,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,0
2,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,3
3,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,3
4,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,2


In [None]:
# loading testing data
test = pd.read_parquet(f"{path_to_data}/test-00000-of-00001-44110b9df98c5585.parquet", engine="pyarrow")
test.head()

Unnamed: 0,image,label
0,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,3
1,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,0
2,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,2
3,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,3
4,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,0


In [None]:
def dict_to_image(img_dict): # decoding image from file
  b_string = img_dict['bytes']
  fbuf_img = np.frombuffer(b_string, np.uint8)
  img = cv2.imdecode(fbuf_img, cv2.IMREAD_GRAYSCALE)
  return img

In [None]:
# preproccesing training data
data['img_arr'] = data['image'].apply(dict_to_image)
data.drop('image', axis=1, inplace=True)
data.head()

Unnamed: 0,label,img_arr
0,2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,3,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,3,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [None]:
# preproccesing testing data
test['img_arr'] = test['image'].apply(dict_to_image)
test.drop('image', axis=1, inplace=True)
test.head()

Unnamed: 0,label,img_arr
0,3,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,2,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,3,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [None]:
n_classes = len(category_classes) # num of classes

In [None]:
class ImageDataset(Dataset): # class for creating comfortable dataset
  def __init__(self, dataframe, transform=None):
    self.dataframe = dataframe
    self.transform = transform

  def __len__(self):
    return len(self.dataframe)

  def __getitem__(self, idx):
    image = self.dataframe.iloc[idx]["img_arr"]
    label = self.dataframe.iloc[idx]["label"]

    if self.transform:
      image = image.astype(np.uint8)
      image = self.transform(image)

    image = torch.tensor(image, dtype=torch.float32).unsqueeze(0)
    label = torch.tensor(label, dtype=torch.long)

    return image, label

In [None]:
class CNN(nn.Module): # CNN model
  def __init__(self):
    super(CNN, self).__init__()
    self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
    self.pool1 = nn.MaxPool2d(2, 2)
    self.batch_norm = nn.BatchNorm2d(num_features=32)
    self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
    self.pool2 = nn.MaxPool2d(2, 2)
    self.flatten = nn.Flatten()
    self.fc1 = nn.Linear(64 * 32 * 32, 128)
    self.dropout = nn.Dropout(p=0.25)
    self.out = nn.Linear(128, n_classes)

  def forward(self, x):
    x = F.mish(self.conv1(x))
    x = self.pool1(x)
    x = self.batch_norm(x)
    x = F.mish(self.conv2(x))
    x = self.pool2(x)
    x = self.flatten(x)
    leakyReLU = nn.LeakyReLU(0.01)
    x = leakyReLU(self.fc1(x))
    x = self.dropout(x)
    x = self.out(x)
    return x

In [None]:
# image augmentation
transforms = v2.Compose([
    v2.RandomHorizontalFlip(p=0.3),
    v2.RandomVerticalFlip(p=0.3),
    v2.GaussianBlur(kernel_size=3),
    v2.RandomRotation(degrees=(-90, 90)),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485], std=[0.229])
])

In [None]:
# hyperparameters
learning_rate = 0.001
n_epochs = 10
batch_size = 32

In [None]:
# loading training dataset
train_dataset = ImageDataset(data, transforms)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

NameError: name 'data' is not defined

In [None]:
def train_model(model, loader, optimizer, n_epochs): # function for training model
  criterion = nn.CrossEntropyLoss()

  for epoch in range(n_epochs):
    full_losses = 0.0
    for data in tqdm.tqdm(loader):
      input, label = data[0], data[1]

      optimizer.zero_grad()
      output = model(input)
      loss = criterion(output, label)

      full_losses += loss.item()
      loss.backward()
      optimizer.step()

    epoch_loss = full_losses/len(loader)
    print(f"EPOCH: {epoch} | LOSS: {epoch_loss}")
  return model

In [None]:
model = CNN() # create model
optimizer = optim.AdamW(model.parameters(), lr=learning_rate) # loading optimizer
final_model = train_model(model, train_loader, optimizer, n_epochs) # training model

100%|██████████| 160/160 [07:57<00:00,  2.99s/it]


EPOCH: 0 | LOSS: 1.5275930032134055


100%|██████████| 160/160 [07:59<00:00,  3.00s/it]


EPOCH: 1 | LOSS: 0.7634654767811299


100%|██████████| 160/160 [07:22<00:00,  2.76s/it]


EPOCH: 2 | LOSS: 0.6281845793128014


100%|██████████| 160/160 [07:38<00:00,  2.87s/it]


EPOCH: 3 | LOSS: 0.46184935290366413


100%|██████████| 160/160 [07:20<00:00,  2.76s/it]


EPOCH: 4 | LOSS: 0.2962267100345343


100%|██████████| 160/160 [07:17<00:00,  2.74s/it]


EPOCH: 5 | LOSS: 0.2139977218117565


100%|██████████| 160/160 [07:41<00:00,  2.88s/it]


EPOCH: 6 | LOSS: 0.1322739682509564


100%|██████████| 160/160 [07:21<00:00,  2.76s/it]


EPOCH: 7 | LOSS: 0.10805716393515467


100%|██████████| 160/160 [07:42<00:00,  2.89s/it]


EPOCH: 8 | LOSS: 0.08518617613299284


100%|██████████| 160/160 [07:19<00:00,  2.75s/it]

EPOCH: 9 | LOSS: 0.059089872492768335





In [None]:
# loading testing dataset
test_dataset = ImageDataset(test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
def predict_model(model, data_loader): # function for model prediction
  model.eval()

  pred_labels, correct_labels = [], []
  with torch.no_grad():
    for input, label in data_loader:
      output = model(input)
      _, pred = torch.max(output, 1)

      pred_labels.extend(pred)
      correct_labels.extend(label)
  return correct_labels, pred_labels

In [None]:
from sklearn.metrics import accuracy_score, f1_score

result = predict_model(final_model, test_loader) # testing model

acc = accuracy_score(result[0], result[1]) # accuracy: 95.5%
print("Accuracy:", acc)

f1_sc = f1_score(result[0], result[1], average='micro') # f1 score: 95.5%
print("F1-score:", f1_sc)

Accuracy: 0.95546875
F1-score: 0.95546875


Saving model

In [None]:
final_model = CNN()
torch.save(final_model, "mri_classification_model.v1.pth")