In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/atharvabhide/Quora-Question-Pair-Similarity/refs/heads/main/notebooks/train.csv")
df.sample(5)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
113343,113343,10535,94794,What are Some of the best gadgets on 2016?,What is the best gadget of 2016?,1
161464,161464,133145,251599,Which are the best colleges for electronics an...,What are the best college in india for electro...,0
217132,217132,71143,151932,What is the main threat for accessing the deep...,Is it safe to access the deep web?,1
285029,285029,405385,199164,What is the reason for price increase of a pro...,When are oil prices expected to rise again?,0
306009,306009,429463,429464,Why do people think that Hinduism is a pagan r...,Why is Hinduism a pagan religion?,1


## MACHINE LEARNING MODELS: -

#### üîπ Classical Models

- Logistic Regression

- K-Nearest Neighbors (KNN)

- Support Vector Classifier (SVC)

- Decision Tree

- Naive Bayes

#### üîπ Ensemble Models

- Random Forest

- Extra Trees

- Bagging Classifier

- AdaBoost

- Gradient Boosting

- XGBoost

- LightGBM

- CatBoost

#### üîπ Meta-Ensemble Models

- Voting Classifier

- Stacking Classifier

## DATA

In [16]:
import pandas as pd 
import numpy as np

data = {
    "area": [1400, 1600, 1700, 1875, 1100, 1550],
    "price": [245000, 312000, 279000, 308000, 199000, 219000]
}
df = pd.DataFrame(data)

X = df[["area"]]
y = df['price']


## MODEL - REGRESSION

### üîπ **Mean Squared Error (MSE)**

* Yes ‚Äî the **smaller the MSE, the better** the model is fitting the data.
* A **large MSE** means predictions are far from actual values.
* But keep in mind: ‚Äúlarge‚Äù or ‚Äúsmall‚Äù is **relative to your target values‚Äô scale**.

  * Example: if house prices are in **millions**, then an MSE in billions might still be okay.
  * That‚Äôs why people often look at **RMSE** (square root of MSE), since it‚Äôs in the same units as the target.

---

### üîπ **R¬≤ Score**

* Correct: **higher is better**.
* **R¬≤ = 1** ‚Üí perfect fit.
* **R¬≤ = 0** ‚Üí model is no better than predicting the mean of the target.
* **R¬≤ < 0** ‚Üí model is doing worse than just predicting the mean (bad performance).

#### Small clarification:

* R¬≤ doesn‚Äôt have to always be **positive**.
* If it‚Äôs negative ‚Üí model is **underperforming** badly.
* So:

  * Good model ‚Üí R¬≤ close to 1.
  * Okay model ‚Üí R¬≤ between 0 and 1.
  * Bad model ‚Üí R¬≤ < 0.

---

#### **So yes, your takeaway is correct:**

* **MSE should be as small as possible**.
* **R¬≤ should be as close to 1 as possible** (positive and high).



In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mean_squared_error(y_test, y_pred)
r2_score(y_test, y_pred)

model.coef_
model.intercept_


34066.66666666677

In [19]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

model = DecisionTreeRegressor(max_depth=5, min_samples_split=10, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(mean_squared_error(y_test, y_pred))

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

print(r2_score(y_test, y_pred))


1864812500.0
43183.47484860384
-0.661672978391624


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mean_squared_error(y_test, y_pred)
r2_score(y_test, y_pred)


In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

model = SVR(kernel='rbf', random_state=42) # linear, poly, rbf
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mean_squared_error(y_test, y_pred)
r2_score(y_test, y_pred)


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

model = KNeighborsRegressor(n_neighbors=3)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mean_squared_error(y_test, y_pred)
r2_score(y_test, y_pred)


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

model = GradientBoostingRegressor(n_estimator=200, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mean_squared_error(y_test, y_pred)
r2_score(y_test, y_pred)


In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

model = AdaBoostRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mean_squared_error(y_test, y_pred)
r2_score(y_test, y_pred)


In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_arror, r2_score

model = XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mean_squared_error(y_test, y_pred)
r2_score(y_test, y_pred)


In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

model = LGMRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mean_squared_error(y_test, y_pred)
r2_score(y_test, y_pred)


In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

model = CatBoostRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mean_squared_error(y_test, y_pred)
r2_score(y_test, y_pred)


## DATA

In [23]:
data = {
    "age": [22, 25, 47, 52, 46, 56, 55, 60],
    "salary": [25000, 32000, 47000, 60000, 42000, 52000, 58000, 72000],
    "purchased": [0, 0, 1, 1, 1, 1, 1, 1]  
}
df = pd.DataFrame(data)

X = df[["age", "salary"]]
y = df["purchased"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)


## MODEL - CLASSIFICATION

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from sklearn.tree import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model = RandomForestClassifier(n_estimator=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model = SVC(kernel="rbf", probability=True, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model = KNeighborsClassifier(n_neighbors=3, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model = GaussianNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model = GradientBoostingClassifier(n_estimator=200, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model = AdaBoostClassifier(n_estimator=200, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model = XGBClassifier(n_estimator=200, learning_rate=0.1, max_depth=3, use_label_encoder=False, eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model = LGBMClassifier(n_estimator=200, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model = CatBoostClassifier(iteration=200, learning_rate=0.1, depth=3, verbose=0, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model = ExtraTreesClassifier(n_estimators=200, max_depth=None, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

model = ExtraTreesClassifier(n_estimators=200, max_depth=None, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


log_clf = LogisticRegression()
tree_clf = DecisionTreeClassifier(random_state=42)
knn_clf = KNeighborsClassifier(n_neighbors=3)

model = VotingClassifier(
    estimators=[('lr', log_clf), ('dt', tree_clf), ('knn', knn_clf)],
    voting='hard'   # 'hard' = majority vote, 'soft' = average predicted probabilities
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

estimators = [
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('svc', SVC(probability=True, random_state=42))
]


final_estimator = LogisticRegression()


model = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

base_model = DecisionTreeClassifier(random_state=42)


model = BaggingClassifier(
    base_estimator=base_model,
    n_estimators=100,
    random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
classification_report(y_test, y_pred)


In [6]:
class Person:

    def __init__(self, name, population):
        self.name = name
        self.population = population


    def get_population(self):
        return self.population

# Usage
p1 = Person("Alice", 20)
p2 = Person("Bob", 30)

print(p1.get_population())
print(p2.get_population())


20
30


In [13]:
class Person:
    global_p = 0

    def __init__(self, name, population):
        self.name = name
        self.population = population

    @property
    def get_population(self):
        return self.population

    @classmethod
    def gp(cls):
        cls.global_p +=1


# Usage
p1 = Person("Alice", 20)
p2 = Person("Bob", 30)

print(p1.get_population)
print(p2.get_population)

print(p1.global_p)
print(p2.global_p)

p1.gp()

print(p1.get_population)
print(p2.get_population)

print(p1.global_p)
print(p2.global_p)

Person.gp()

print(p1.get_population)
print(p2.get_population)

print(p1.global_p)
print(p2.global_p)


20
30
0
0
20
30
1
1
20
30
2
2
