
### You are given a dataset – “hotel_bookings.csv”. The dataset has a high number of null elements that need to be cleansed. Your job is to create a separate DataFrame with only the categorical columns and perform the following operations:
### 1.	Find the number of null values in each column of the new DataFrame
### 2.	Replace the null values in each column with that column's mode
### 3.	In the "hotel" column, replace the hotel names with "0" and "1" based on the condition that – if "hotel" = "city_hotel" (stored as "City Hotel" in the data), then "hotel" = "1"; else, "0"
### 4.	Using the label encoder, assign a unique country code to each country
### 5.	Using the one-hot encoder, encode the “month” column

In [None]:
import numpy as np
import pandas as pd
import sklearn

# Read the bookings CSV from the working directory into a DataFrame.
hotel_book = pd.read_csv(filepath_or_buffer='hotel_bookings.csv')
# Trailing bare name: rendered as this cell's output.
hotel_book

In [None]:
import numpy as np
import pandas as pd
import sklearn

# Load the bookings data.
hotel_book = pd.read_csv('hotel_bookings.csv')

# Quick look at the data. These statements sit in the middle of a long cell,
# where a bare expression is evaluated and then thrown away (only the last
# expression of a cell is rendered), so every view is printed explicitly.
print(hotel_book.head(5))      # first rows
print(hotel_book.tail(5))      # last rows
print(hotel_book.sample(5))    # random rows
print(hotel_book.describe())   # summary statistics of the numeric columns

# Which columns contain at least one null value?
print(hotel_book.isnull().any())


# Find the number of null values in each column of the DataFrame.
# Printed explicitly: a bare expression in the middle of a cell is never shown.
print(hotel_book.isnull().sum())

# 'company' and 'agent' are removed outright rather than imputed.
hotel_book = hotel_book.drop(['company', 'agent'], axis=1)


# Replace the null values with mode.
# Every column is filled with its OWN most frequent value. DataFrame.mode()
# returns one row per tied mode, so .iloc[0] is a Series (indexed by column
# name) holding the first mode of each column, and fillna() matches it to the
# columns by name. Filling the whole frame with the mode of 'country' would
# write a country string into unrelated columns, numeric ones included.
column_modes = hotel_book.mode().iloc[0]
hotel_book = hotel_book.fillna(column_modes)

# Confirm that nothing is left to impute.
print(hotel_book.isnull().sum())


# In the "hotel" column, replace the hotel names with 1 / 0:
# rows whose name contains 'City Hotel' become 1, every other row becomes 0.
is_city_hotel = hotel_book['hotel'].str.contains('City Hotel')
hotel_book['hotel'] = np.where(is_city_hotel, 1, 0)
hotel_book
hotel_book.sample(5)


# Using the label encoder, assign a unique integer code to each country.
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
country_codes = LE.fit_transform(hotel_book['country'])

# Variant 1: keep the codes in a separate 'country code' column.
hotel_book['country code'] = country_codes
hotel_book

# OR

# Variant 2: overwrite 'country' itself. Both variants run, so after this
# line 'country' and 'country code' hold the same codes.
hotel_book['country'] = hotel_book['country code']
hotel_book

hotel_book.sample(5)


# Using onehot encoder, encode the "month" column
#
# OneHotEncoder needs 2-D input. Passing the 1-D Series
# hotel_book['arrival_date_month'] raises
#     ValueError: Expected 2D array, got 1D array instead
# so the column is selected with double brackets (approach 1) or reshaped to
# (-1, 1) (approach 2).
#
# Encoder options:
#   * The encoder returns a SciPy sparse matrix by default; .toarray() turns
#     it into a dense NumPy array that converts easily into a DataFrame.
#     (The constructor flag for dense output was renamed from `sparse` to
#     `sparse_output` in newer scikit-learn releases; calling .toarray()
#     avoids depending on either spelling.)
#   * drop=None keeps one column per category. Use drop='first' to drop the
#     first category and avoid multicollinearity in regression models.
from sklearn.preprocessing import OneHotEncoder

OHE = OneHotEncoder(drop=None)

# The three approaches below are ALTERNATIVES. Each one builds its own result
# from the still-present 'arrival_date_month' column; the column is replaced
# exactly once, at the end, so no approach looks up a column that a previous
# approach has already dropped.

# 1) OneHotEncoder on a 2-D (single-column DataFrame) selection
month_encoded = OHE.fit_transform(hotel_book[['arrival_date_month']]).toarray()
# Convert to DataFrame with appropriate column names
month_df = pd.DataFrame(
    month_encoded,
    columns=OHE.get_feature_names_out(['arrival_date_month']),
    index=hotel_book.index,  # same row labels, so the concat lines up row by row
)

# 2) USING RESHAPE
# Convert column to NumPy array and reshape to 2D
month_array = hotel_book['arrival_date_month'].to_numpy().reshape(-1, 1)
# Fit and transform the reshaped array
month_encoded_reshaped = OHE.fit_transform(month_array).toarray()
# Create DataFrame from encoded array, aligned on the original index
month_df_reshaped = pd.DataFrame(
    month_encoded_reshaped,
    columns=OHE.get_feature_names_out(['arrival_date_month']),
    index=hotel_book.index,
)

# 3) USING get_dummies()
month_dummies = pd.get_dummies(
    hotel_book['arrival_date_month'], prefix='month', drop_first=False
)

# Replace the original column with the encoded columns from approach 1
# (the OneHotEncoder result the exercise asks for).
hotel_book = pd.concat(
    [hotel_book.drop('arrival_date_month', axis=1), month_df], axis=1
)
print(hotel_book)

## Scaling the Data

### 1. Min-Max Scaler

#### Using Library

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scaler with the default target interval, spelled out explicitly.
minmax = MinMaxScaler(feature_range=(0, 1))

# For a different target interval, e.g. [10, 20]:
# minmax = MinMaxScaler(feature_range=(10, 20))

In [None]:
# column data
# Column that the scaling cells below operate on.
x = hotel_book.loc[:, 'lead_time']
x

In [None]:
 hotel_book['lead_time']= minmax.fit_transform(hotel_book[['lead_time']])

In [None]:
# Show the frame after 'lead_time' was min-max scaled in place.
hotel_book

#### Using Scratch [X = (X - X_min) / (X_max - X_min)]

In [None]:
# Current 'lead_time' column: the input to the manual min-max formula.
lt = hotel_book.loc[:, 'lead_time']
lt

In [None]:
# X = (X - X_min) / (X_max - X_min), written with Series methods.
hotel_book['lead_time'] = lt.sub(lt.min()).div(lt.max() - lt.min())

In [None]:
# Show the frame after the manual min-max formula was applied to 'lead_time'.
hotel_book

In [None]:
# Scale on a copy so that hotel_book itself is left untouched.
hotel_book1 = hotel_book.copy()

# Column extremes used by the min-max formula.
min_val, max_val = hotel_book['lead_time'].min(), hotel_book['lead_time'].max()

# X = (X - X_min) / (X_max - X_min)
hotel_book1['lead_time'] = hotel_book['lead_time'].sub(min_val).div(max_val - min_val)

In [None]:
# Min-max scale a list of columns, storing each result in a new
# '<column>_scaled' column of a copy so the source frame is left untouched.
hotel_book1 = hotel_book.copy()

# Columns to scale; extend the list to scale more than one column.
x = ['lead_time']

for col in x:
    min_val = hotel_book[col].min()
    max_val = hotel_book[col].max()
    # X = (X - X_min) / (X_max - X_min)
    hotel_book1[col + '_scaled'] = (hotel_book[col] - min_val) / (max_val - min_val)

# Print the copy: the '_scaled' columns exist only in hotel_book1, so printing
# hotel_book would not show the result of this cell.
print(hotel_book1)

### 2. Standard Scaler

#### using library

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise 'lead_time' in place: learn the statistics, then apply them.
ss = StandardScaler()
hotel_book['lead_time'] = ss.fit(hotel_book[['lead_time']]).transform(hotel_book[['lead_time']])
print(hotel_book['lead_time'])

#### using Scratch 

#### X = (X - mean) / sd

In [None]:
# Manual standardisation: X = (X - mean) / sd
lt = hotel_book['lead_time']
print(lt)

mean = lt.mean()
# Series.std() defaults to the sample standard deviation (ddof=1), whereas
# StandardScaler divides by the population standard deviation. ddof=0 makes
# this manual version reproduce the library result.
std = lt.std(ddof=0)
hotel_book['lead_time'] = (lt - mean) / std
print(hotel_book)

### 3. Robust Scaling

#### Using Library

In [None]:
from sklearn.preprocessing import RobustScaler

# Robust-scale 'lead_time' in place: fit the scaler, then transform the same data.
robust = RobustScaler()
hotel_book['lead_time'] = robust.fit(hotel_book[['lead_time']]).transform(hotel_book[['lead_time']])
print(hotel_book['lead_time'])

#### Using Scratch

#### X = (X - median) / IQR

#### IQR= Q_3 - Q_1

In [None]:
# Manual robust scaling: X = (X - median) / IQR, with IQR = Q3 - Q1.
lt = hotel_book['lead_time']
lt

# First and third quartiles, and the spread between them.
q1, q3 = lt.quantile(0.25), lt.quantile(0.75)
iqr = q3 - q1
median = lt.median()

hotel_book['lead_time'] = lt.sub(median).div(iqr)
print(hotel_book['lead_time'])

### Normalization

In [None]:
import pandas as pd
from sklearn.preprocessing import normalize
import numpy as np

In [None]:
# 'lead_time' as a 2-D NumPy array (one column), the shape normalize() takes.
lt = hotel_book[['lead_time']].to_numpy()
lt

In [None]:
# L2-normalise the 'lead_time' column as one feature vector.
# normalize() works per ROW by default (axis=1). `lt` has a single column, so
# every row would be a one-element vector and each value would collapse to
# +1, -1 or 0. axis=0 normalises along the column instead, giving the whole
# column unit length.
norm = normalize(lt, axis=0)

# normalize() returns an (n, 1) array; flatten it to assign a single column.
hotel_book['lead_time'] = norm.ravel()

print(hotel_book)

#### Normalization (specifically L2 normalization) rescales the values so that the
#### vector being normalized (a row, or a feature column) has unit norm (length = 1)


#### X_norm = X / sqrt(sum(X^2))

In [None]:
# Work on a copy so that hotel_book keeps its current values.
hotel_book1 = hotel_book.copy()

# Euclidean (L2) length of the column: square root of the sum of squares.
sum_of_squares = hotel_book1['lead_time'].pow(2).sum()
l2_norm = np.sqrt(sum_of_squares)

In [None]:
# Divide the column by its L2 norm so the column vector has unit length.
hotel_book1['lead_time'] = hotel_book1['lead_time'] / l2_norm
# Print the copy that was just modified; hotel_book is unchanged by this cell.
print(hotel_book1)

### You are given a dataset – “Heart_Disease.csv”. The dataset has a high number of null elements that need to be cleansed.
### 1. Find the number of null values in each column of the new DataFrame.
### 2. Perform feature scaling on the numerical columns of the dataset using the following techniques:
### (Min-Max Scaler, Standard Scaler, Robust Scaler)
### 3. Use Data Normalization