In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    file_paths = [os.path.join(folder, name) for name in names]
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the hotel booking CSV from the Kaggle input directory into `data`.
data = pd.read_csv('/kaggle/input/hotel-bookingcvs/hotel_booking.csv')

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
# Preview the last rows of the dataset.
data.tail()

In [None]:

# Show 10 randomly chosen rows. The fixed seed keeps this preview identical
# every time the notebook is restarted and run from the top.
data.sample(10, random_state=42)

In [None]:
# Print pandas' summary of the DataFrame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Dataset dimensions as (rows, columns).
data.shape

In [None]:
# Print the value frequencies of every object/category column, one table per column.
text_columns = data.select_dtypes(include=['category', 'object']).columns
for column in text_columns:
    frequency_table = data[column].value_counts()
    print(f"Column: {column}")
    print(frequency_table)
    print("\n")

In [None]:
# Collect the rows that pandas flags as duplicates of another row.
is_duplicate = data.duplicated()
duplicated_rows = data.loc[is_duplicate]

In [None]:
# Display the rows held in `duplicated_rows`.
duplicated_rows

In [None]:
# Count the missing entries in each column.
null_values = data.isna().sum()

In [None]:
# Display the per-column counts held in `null_values`.
null_values

In [None]:

# Fill missing 'children' values with the column's most frequent value (mode).
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selection: the in-place form mutates an
# intermediate object, emits a FutureWarning on pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is enabled.
data['children'] = data['children'].fillna(data['children'].mode()[0])

# Re-check the remaining missing values per column.
data.isnull().sum()

In [None]:
# Write the current contents of `data` to data.json, one JSON record per line.
# NOTE(review): at this point only 'children' has been imputed; the 'country'
# and 'agent' fills and the 'company' drop happen in later cells, so the file
# does not reflect them — confirm this ordering is intended.
data.to_json('data.json', orient='records', lines=True)

In [None]:
# Impute missing values in the 'country' column with the most frequent value (mode).
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selection: the in-place form mutates an
# intermediate object, emits a FutureWarning on pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is enabled.
data['country'] = data['country'].fillna(data['country'].mode()[0])

# Re-check the remaining missing values per column.
data.isnull().sum()

In [None]:
# Impute missing values in the 'agent' column with the placeholder "Unknown".
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selection: the in-place form mutates an
# intermediate object, emits a FutureWarning on pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is enabled.
# NOTE(review): if 'agent' is stored as numbers, filling with a string turns
# the column into object dtype — confirm that is acceptable downstream.
data['agent'] = data['agent'].fillna('Unknown')

# Re-check the remaining missing values per column.
data.isnull().sum()

In [None]:
# Drop the 'company' column when it exists; otherwise just report that it is absent.
if 'company' not in data.columns:
    print("Column 'company' not found in DataFrame.")
else:
    data = data.drop(columns='company')

# Show the per-column missing-value counts after the drop.
data.isnull().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object/category feature on a copy, so `data` itself
# keeps its original values.
encoded_df = data.copy()
target_column = 'is_canceled'

# Columns to encode: object/category dtype, excluding the target.
categorical_columns = data.select_dtypes(include=['category', 'object']).columns
categorical_columns = categorical_columns[categorical_columns != target_column]

# Cast to str first so every value in a column is the same type before encoding.
encoded_df[categorical_columns] = encoded_df[categorical_columns].astype(str)

# Keep one fitted encoder per column. Refitting a single shared encoder would
# overwrite its classes_ on every iteration, leaving no way to map the integer
# codes of earlier columns back to their labels.
label_encoders = {}
label_encoder = LabelEncoder()  # stays defined even if there is nothing to encode

for col in categorical_columns:
    label_encoder = LabelEncoder()
    encoded_df[col] = label_encoder.fit_transform(encoded_df[col])
    label_encoders[col] = label_encoder

# NOTE(review): integer codes impose an arbitrary order on nominal categories;
# confirm this is acceptable for the linear model trained on these features.
encoded_df.head()

---

In [None]:
# Separate the label from the features: y is the target column, X is everything else.
y = encoded_df[target_column]
X = encoded_df.drop(columns=target_column)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Hold out 20% of the rows as a test set; random_state=42 fixes the shuffle.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Logistic regression classifier with the solver iteration limit set to 1000.
# NOTE(review): no feature scaling is applied in the visible cells; if fitting
# emits a ConvergenceWarning, scaling X is the usual remedy — confirm.
logistic_model = LogisticRegression(max_iter=1000)

In [None]:
# Fit the classifier on the training split.
logistic_model.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test features.
y_pred = logistic_model.predict(X_test)

In [None]:
# Report test-set performance: the confusion matrix first, then the text
# classification report.
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)
test_report = classification_report(y_test, y_pred)
print(test_report)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Heatmap of the test-set confusion matrix with integer cell annotations.
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns, so the axes are labelled accordingly;
# without labels the figure cannot be read on its own.
ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion Matrix for Logistic Regression')
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc

# Score the test set with the second predict_proba column (presumably the
# probability of is_canceled == 1 — confirm the class order) and compute the
# ROC curve and its area.
y_proba = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

# Draw the model's ROC curve against the dashed diagonal reference line.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'Logistic Regression (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_title('ROC Curve for Logistic Regression')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
plt.show()