# Import Necessary Libraries 

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Load The Data 

In [17]:
# Read the v3 feature-selection CSV (the path is relative to the notebook's working directory).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a row index
# written out by an earlier to_csv call — confirm whether it should be dropped or read via index_col=0.
df = pd.read_csv(r'../data/v3_Feature_Selection.csv')
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Uttarahalli,1440.0,2.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Kothanur,1200.0,2.0,51.0,2


In [18]:
# Derive a price-per-square-foot feature; it is meant to help with outlier detection.
# The 100000 multiplier presumably converts 'price' from lakhs to rupees — TODO confirm the unit.
df['price_per_sqft'] = df['price'].mul(100000).div(df['total_sqft'])
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0


In [19]:
# List the distinct location values present in the dataset.
df['location'].unique()

<StringArray>
[                       'Electronic City Phase II',
                                'Chikka Tirupathi',
                                     'Uttarahalli',
                              'Lingadheeranahalli',
                                        'Kothanur',
                                      'Whitefield',
                                'Old Airport Road',
                                    'Rajaji Nagar',
                                    'Marathahalli',
                              '7th Phase JP Nagar',
 ...
                                   'Kavika Layout',
                                        'Zuzuvadi',
                           'Kanakapura main  Road',
                                   'Sindhi Colony',
                             'Kanakapur main road',
                       'Prasanna layout Herohalli',
                               'Sarvobhogam Nagar',
 '12th cross srinivas nagar banshankari 3rd stage',
                               'Havanur exten

In [20]:
# Number of distinct location values (a missing value, if any, is counted once).
df['location'].nunique(dropna=False)

1215

In [21]:
# Strip leading/trailing whitespace so differently padded spellings of one location are
# counted together (in this run the distinct count drops from 1215 to 1205).
# .str.strip() is vectorised and leaves missing values untouched, whereas calling
# x.strip() per element raises on a NaN location.
# NOTE: internal spacing and spelling variants (e.g. 'Sarjapur  Road', 'whitefiled')
# are not normalised by this step.
df['location'] = df['location'].str.strip()

# Number of rows per location, most frequent first.
location_stage = df.groupby('location')['location'].count().sort_values(ascending=False)
location_stage

location
Whitefield                     520
Sarjapur  Road                 373
Electronic City                281
Kanakpura Road                 245
Thanisandra                    230
                              ... 
Kannur                           1
Karnataka Shabarimala            1
Kasthuri Nagar East Of NGEF      1
Kathreguppe                      1
whitefiled                       1
Name: location, Length: 1205, dtype: int64

In [22]:
# Whitefield is the most frequent location, with 520 data points.
# Count how many locations have 10 or fewer data points.
int((location_stage <= 10).sum())

991

In [23]:
# Keep only the locations that have 10 or fewer rows; the index holds the location names.
# The variable name is left unchanged because a later cell refers to it.
location_stage_less_then_10 = location_stage.loc[location_stage.le(10)]
location_stage_less_then_10

location
1st Block Jayanagar            10
Dodsworth Layout               10
Kalkere                        10
Tindlu                         10
Nagappa Reddy Layout           10
                               ..
Kannur                          1
Karnataka Shabarimala           1
Kasthuri Nagar East Of NGEF     1
Kathreguppe                     1
whitefiled                      1
Name: location, Length: 991, dtype: int64

In [24]:
# Distinct location values after whitespace stripping.
df['location'].nunique(dropna=False)

1205

In [25]:
# Collapse every location with 10 or fewer rows into a single 'other' bucket.
# The rare location names are the index labels of location_stage_less_then_10,
# so membership is tested against that index explicitly.
df['location'] = df['location'].mask(
    df['location'].isin(location_stage_less_then_10.index),
    'other',
)
df['location'].nunique(dropna=False)

215

In [26]:
# Preview the first rows after rare locations were replaced with 'other'.
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0


# Save the Feature-Engineered Data

In [27]:
# Write the engineered frame to the v4 CSV. index=False keeps the row index out of the file;
# the 'Unnamed: 0' column seen in the previews is a regular column and is therefore still written.
df.to_csv('../data/v4_Feature_Engineering.csv', index=False)

### Function for `src/components/d_feature_engineering.py`

In [28]:
def feature_engineering(df, max_rare_count=10):
    """
    Add a price-per-square-foot feature and group infrequent locations into 'other'.

    The input DataFrame is modified in place (a 'price_per_sqft' column is added and
    'location' is overwritten) and the same object is returned.

    Parameters:
    df (pd.DataFrame): Input data. Must contain the columns 'price', 'total_sqft'
        and 'location'.
    max_rare_count (int): Locations that occur in this many rows or fewer are
        replaced with the label 'other'. Defaults to 10.

    Returns:
    pd.DataFrame: The same DataFrame object, with 'price_per_sqft' added and
        'location' cleaned and grouped.

    Raises:
    KeyError: If one of the required columns is missing.
    """
    # Price scaled by 100000 and divided by area.
    # NOTE(review): the multiplier presumably converts lakhs to rupees — confirm the unit.
    # Rows with total_sqft == 0 yield inf here and are not filtered.
    df['price_per_sqft'] = df['price'] * 100000 / df['total_sqft']

    # Vectorised strip: unlike a per-element x.strip(), missing locations stay
    # missing instead of raising AttributeError.
    df['location'] = df['location'].str.strip()

    # Rows per location; missing locations are not counted.
    location_counts = df['location'].value_counts()

    # Names of the locations that occur max_rare_count times or fewer.
    rare_locations = location_counts[location_counts <= max_rare_count].index

    # Explicit membership test against the rare names; missing values never match
    # and are therefore left as they are.
    df['location'] = df['location'].mask(df['location'].isin(rare_locations), 'other')

    return df