In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
df = pd.read_csv("Predict-The-Data-Scientists-Salary-In-India_Train_Dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,experience,job_description,job_desig,job_type,key_skills,location,salary,company_name_encoded
0,0,5-7 yrs,Exp: Minimum 5 years;Good understanding of IOC...,Senior Exploit and Vulnerability Researcher,,"team skills, communication skills, analytical ...",Delhi NCR(Vikas Puri),6to10,3687
1,1,10-17 yrs,He should have handled a team of atleast 5-6 d...,Head SCM,,"ppc, logistics, inventory management, supply c...",Sonepat,10to15,458
2,2,5-9 yrs,Must be an effective communicator (written & s...,Deputy Manager - Talent Management & Leadershi...,Analytics,"HR Analytics, Employee Engagement, Training, S...",Delhi NCR,15to25,4195
3,3,7-10 yrs,7 - 10 years of overall experience in data e...,Associate Manager Data Engineering,Analytics,"SQL, Javascript, Automation, Python, Ruby, Ana...",Bengaluru,10to15,313
4,4,1-3 yrs,Chartered Accountancy degree or MBA in Finance...,TS- GSA- Senior Analyst,,"accounting, finance, cash flow, financial plan...",Gurgaon,3to6,1305


In [4]:
df[['min_exp', 'max_exp']] = df['experience'].str.extract(r'(\d+)-(\d+)').astype(float)


In [5]:
df['min_exp'].fillna(df['min_exp'].median(), inplace=True)
df['max_exp'].fillna(df['max_exp'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['min_exp'].fillna(df['min_exp'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['max_exp'].fillna(df['max_exp'].median(), inplace=True)


In [6]:
df.drop(['company_name_encoded', 'experience', 'job_description', 'key_skills'], axis=1, inplace=True)

In [7]:
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col].astype(str))

In [8]:
X = df.drop('salary', axis=1)
y = df['salary']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
models = {
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(n_estimators=100),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=100),
    'AdaBoost': AdaBoostClassifier(n_estimators=100),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100)
}

In [11]:
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    results[name] = acc
    print(f"\n{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, preds))


DecisionTree Accuracy: 0.3229
              precision    recall  f1-score   support

           0       0.50      0.51      0.50       609
           1       0.29      0.27      0.28       944
           2       0.31      0.33      0.32       806
           3       0.35      0.36      0.36       300
           4       0.29      0.28      0.29       591
           5       0.24      0.24      0.24       711

    accuracy                           0.32      3961
   macro avg       0.33      0.33      0.33      3961
weighted avg       0.32      0.32      0.32      3961


RandomForest Accuracy: 0.3633
              precision    recall  f1-score   support

           0       0.55      0.65      0.59       609
           1       0.31      0.32      0.31       944
           2       0.34      0.39      0.36       806
           3       0.45      0.41      0.43       300
           4       0.33      0.25      0.28       591
           5       0.26      0.23      0.24       711

    accuracy   

In [12]:
best_model = max(results, key=results.get)
print(f"\n Best performing model: {best_model} with accuracy: {results[best_model]:.4f}")


 Best performing model: GradientBoosting with accuracy: 0.4014


In [13]:
from tpot import TPOTClassifier

In [14]:
df = pd.read_csv("titanic_train.csv")

In [15]:
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
df.drop(['name', 'ticket', 'cabin'], axis=1, inplace=True)

In [17]:
df['age'].fillna(df['age'].median(), inplace=True)
df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)


In [18]:
le = LabelEncoder()
df['sex'] = le.fit_transform(df['sex'])       # male=1, female=0
df['embarked'] = le.fit_transform(df['embarked'])

In [21]:
df.rename(columns={'survived': 'class'}, inplace=True)

In [22]:
X = df.drop('class', axis=1)
y = df['class']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
tpot = TPOTClassifier(verbosity=2, generations=5, population_size=20, random_state=42)
tpot.fit(X_train, y_train)

is_classifier
is_regressor




is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier


Version 0.12.2 of tpot is outdated. Version 1.0.0 was released Wednesday February 26, 2025.


                                                                        
Generation 1 - Current best internal CV score: 0.8145671230178273
                                                                        
Generation 2 - Current best internal CV score: 0.8202403230572244
                                                                        
Generation 3 - Current best internal CV score: 0.8202403230572244
                                                                        
Generation 4 - Current best internal CV score: 0.8202403230572244
                                                                        
Generation 5 - Current best internal CV score: 0.8244361272530287
                                                                        
Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.8, min_samples_leaf=4, min_samples_split=5, n_estimators=100)


In [25]:
score = tpot.score(X_test, y_test)
print(f"\n TPOT Best Model Test Score: {score:.4f}")


 TPOT Best Model Test Score: 0.8101


In [26]:
tpot.export('tpot_best_pipeline_titanic.py')