In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv('week 1/first inten project.csv')
print("Dataset shape:", df.shape)


Dataset shape: (36285, 17)


In [23]:
#First rows
df.head()

Unnamed: 0,Booking_ID,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,date of reservation,booking status
0,INN00001,1,1,2,5,Meal Plan 1,0,Room_Type 1,224,Offline,0,0,0,88.0,0,10/2/2015,Not_Canceled
1,INN00002,1,0,1,3,Not Selected,0,Room_Type 1,5,Online,0,0,0,106.68,1,11/6/2018,Not_Canceled
2,INN00003,2,1,1,3,Meal Plan 1,0,Room_Type 1,1,Online,0,0,0,50.0,0,2/28/2018,Canceled
3,INN00004,1,0,0,2,Meal Plan 1,0,Room_Type 1,211,Online,0,0,0,100.0,1,5/20/2017,Canceled
4,INN00005,1,0,1,2,Not Selected,0,Room_Type 1,48,Online,0,0,0,77.0,0,4/11/2018,Canceled


In [24]:
print("Null values:")
print(df.isnull().sum())

Null values:
Booking_ID                  0
number of adults            0
number of children          0
number of weekend nights    0
number of week nights       0
type of meal                0
car parking space           0
room type                   0
lead time                   0
market segment type         0
repeated                    0
P-C                         0
P-not-C                     0
average price               0
special requests            0
date of reservation         0
booking status              0
dtype: int64


In [25]:
print("Data types:")
print(df.dtypes)

Data types:
Booking_ID                   object
number of adults              int64
number of children            int64
number of weekend nights      int64
number of week nights         int64
type of meal                 object
car parking space             int64
room type                    object
lead time                     int64
market segment type          object
repeated                      int64
P-C                           int64
P-not-C                       int64
average price               float64
special requests              int64
date of reservation          object
booking status               object
dtype: object


In [26]:
# Remove whitespace from string columns
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype(str).str.strip()



In [27]:
df.describe()

Unnamed: 0,number of adults,number of children,number of weekend nights,number of week nights,car parking space,lead time,repeated,P-C,P-not-C,average price,special requests
count,36285.0,36285.0,36285.0,36285.0,36285.0,36285.0,36285.0,36285.0,36285.0,36285.0,36285.0
mean,1.844839,0.10536,0.810693,2.204602,0.030977,85.239851,0.02563,0.023343,0.153369,103.421636,0.619733
std,0.518813,0.402704,0.87059,1.410946,0.173258,85.938796,0.158032,0.368281,1.753931,35.086469,0.786262
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,1.0,0.0,17.0,0.0,0.0,0.0,80.3,0.0
50%,2.0,0.0,1.0,2.0,0.0,57.0,0.0,0.0,0.0,99.45,0.0
75%,2.0,0.0,2.0,3.0,0.0,126.0,0.0,0.0,0.0,120.0,1.0
max,4.0,10.0,7.0,17.0,1.0,443.0,1.0,13.0,58.0,540.0,5.0


In [28]:
# 2. Outlier handling using IQR
numeric_cols = df.select_dtypes(include=[np.number]).columns

print("Before outlier removal:", df.shape)

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = ((df[col] < lower_bound) | (df[col] > upper_bound))
    print(f"{col}: {outliers.sum()} outliers")
    
    # Remove outliers
    df = df[~outliers]

print("After outlier removal:", df.shape)
df = df.reset_index(drop=True)


Before outlier removal: (36285, 17)
number of adults: 10175 outliers
number of children: 2390 outliers
number of weekend nights: 7 outliers
number of week nights: 164 outliers
car parking space: 650 outliers
lead time: 623 outliers
repeated: 180 outliers
P-C: 0 outliers
P-not-C: 0 outliers
average price : 531 outliers
special requests: 376 outliers
After outlier removal: (21189, 17)


In [29]:
# 3. Feature Engineering


target_col = "booking status"
print(f"Target unique values: {df[target_col].unique()}")

# define x and y 
X = df.drop(target_col, axis=1)
y = df[target_col]

print(f"Features: {X.columns.tolist()}")
print(f"Feature shape: {X.shape}")


Target unique values: ['Canceled' 'Not_Canceled']
Features: ['Booking_ID', 'number of adults', 'number of children', 'number of weekend nights', 'number of week nights', 'type of meal', 'car parking space', 'room type', 'lead time', 'market segment type', 'repeated', 'P-C', 'P-not-C', 'average price ', 'special requests', 'date of reservation']
Feature shape: (21189, 16)


In [30]:
# 4. Categorical Data Transformation
categorical_cols = X.select_dtypes(include=['object']).columns
print(f"Categorical columns: {categorical_cols.tolist()}")

# Encode categorical variables
encoded_cols = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoded_cols[col] = le


target_le = LabelEncoder()
y = target_le.fit_transform(y)

print("Categorical encoding completed")

Categorical columns: ['Booking_ID', 'type of meal', 'room type', 'market segment type', 'date of reservation']
Categorical encoding completed


In [31]:
# 5. Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training  shape: {X_train.shape}")
print(f"Test  shape: {X_test.shape}")

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



Training  shape: (16951, 16)
Test  shape: (4238, 16)


In [32]:
# Modeling and Accuracy Calculation
# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)
rf_pred = rf.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")


Random Forest Accuracy: 0.8747


In [33]:
# Logistic Regression
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train_scaled, y_train)
lr_pred = lr.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, lr_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")


Logistic Regression Accuracy: 0.8006


In [34]:
# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Important Features:")
print(feature_importance.head(10))



Top 10 Important Features:
                     feature  importance
8                  lead time    0.331561
13            average price     0.156921
15       date of reservation    0.141312
0                 Booking_ID    0.115802
14          special requests    0.096902
4      number of week nights    0.045909
9        market segment type    0.043246
3   number of weekend nights    0.033990
5               type of meal    0.019510
7                  room type    0.014847
