In [None]:
import pandas as pd
import numpy as np
import re

# ---------------------------------------------------------
# READ THE FILE (YOUR REQUIRED WAY)
# ---------------------------------------------------------
df = pd.read_csv("pune_commercial_listings_nobroker.csv")  # load the source CSV
print("Original shape:", df.shape)

# All cleaning below happens on a copy, so `df` keeps the data exactly as loaded.
clean = df.copy()

# ---------------------------------------------------------
# CLEANING LOGIC (UNCHANGED)
# ---------------------------------------------------------

def parse_deposit(x):
    """Convert a raw deposit value into an integer amount.

    Parameters
    ----------
    x : object
        Raw cell value, e.g. ``"1,50,000"``, ``"50,000.00"`` or ``"No Deposit"``.

    Returns
    -------
    int or float
        ``0`` when the text is one of the explicit "no deposit" markers,
        the parsed amount as ``int`` otherwise, and ``np.nan`` when the value
        is missing or contains no digits.

    Notes
    -----
    Thousands separators (commas) are removed and a trailing decimal fraction
    of one or two digits is discarded. Unit words (for example "lakh") are
    not interpreted.
    """
    if pd.isna(x):
        return np.nan
    s = str(x).strip().strip('"').strip("'")
    if s.lower() in ['no deposit','no','-','na','n/a','none']:
        return 0
    s2 = s.replace(",", "")
    # Drop a decimal fraction before joining digit groups; otherwise
    # "50000.00" would be glued together as 5000000. The lookahead leaves
    # dot-grouped numbers such as "1.50.000" untouched.
    s2 = re.sub(r'(?<=\d)\.\d{1,2}(?![\d.])', '', s2)
    nums = re.findall(r'\d+', s2)
    if not nums:
        return np.nan
    return int("".join(nums))

def normalize_property_type(x):
    """Map a free-text property type onto a small set of canonical labels.

    Missing values and the placeholders ``""``, ``"nan"`` and ``"na"`` become
    ``"Unknown"``. Text containing "office", "shop", or "warehouse"/"godown"
    (checked in that order, case-insensitively) becomes ``"Office"``,
    ``"Shop"`` or ``"Warehouse"``. Anything else is returned title-cased.
    """
    if pd.isna(x):
        return "Unknown"

    text = str(x).strip().lower()

    # Ordered keyword table: the first entry with a matching keyword wins.
    keyword_labels = (
        (("office",), "Office"),
        (("shop",), "Shop"),
        (("warehouse", "godown"), "Warehouse"),
    )
    for keywords, label in keyword_labels:
        if any(word in text for word in keywords):
            return label

    return "Unknown" if text in ("", "nan", "na") else text.title()

def norm_parking(x):
    """Normalise a free-text parking description.

    Parameters
    ----------
    x : object
        Raw cell value.

    Returns
    -------
    str
        ``"Not Available"`` for missing values, explicit negatives
        ("no", "na", "n/a", "-", "none", "no ... parking", "not available",
        "unavailable"); ``"Available"`` when the text mentions parking, a
        slot, or availability; otherwise the input title-cased.
    """
    if pd.isna(x):
        return "Not Available"
    s = str(x).strip().lower()
    if "no" in s and "parking" in s:
        return "Not Available"
    if s in ["no","na","n/a","-","none"]:
        return "Not Available"
    # Negated availability must be handled before the generic "available"
    # substring test below; otherwise "not available" (this function's own
    # output) and "unavailable" would be reported as "Available".
    if "not available" in s or "unavailable" in s:
        return "Not Available"
    if "parking" in s or "slot" in s or "available" in s:
        return "Available"
    return s.title()

def norm_furnishing(x):
    """Normalise a free-text furnishing description.

    Parameters
    ----------
    x : object
        Raw cell value.

    Returns
    -------
    str
        ``"Unknown"`` for missing values; ``"Semi-Furnished"``,
        ``"Unfurnished"`` or ``"Furnished"`` when the text mentions
        furnishing; otherwise the input title-cased.
    """
    if pd.isna(x):
        return "Unknown"
    s = str(x).strip().lower()
    if "furnish" in s:
        if "semi" in s:
            return "Semi-Furnished"
        # Only treat the text as negated when the negation is attached to
        # "furnish" ("unfurnished", "un-furnished", "non furnished",
        # "not furnished"). A bare `"un" in s` test also matched unrelated
        # words such as "unit" and mislabelled furnished listings.
        if re.search(r'\b(?:un|non|not)[\s-]*furnish', s):
            return "Unfurnished"
        return "Furnished"
    return s.title()


# ---------------------------------------------------------
# APPLY CLEANING
# ---------------------------------------------------------

# Parse the raw deposit text when that column exists; otherwise the new
# column is filled with NaN.
clean["deposit"] = (
    clean["Deposit_raw"].apply(parse_deposit)
    if "Deposit_raw" in clean.columns
    else np.nan
)

# Source header -> snake_case name used by the rest of the notebook.
rename_map = dict(
    Locality="locality",
    Location="location_full",
    Rent_inr="rent",
    Sqft="sqft",
    Price_per_sqft="price_per_sqft",
    Available_from="available_from",
    Property_type="property_type",
    Parking="parking",
    Furnishing="furnishing",
)

# rename() ignores mapping keys that are absent from the frame by default,
# so the mapping can be passed as-is.
clean = clean.rename(columns=rename_map)

# Raw/scrape-only columns; errors="ignore" skips any that are not present.
cols_to_drop = ["Title","Detail_URL","Rent_raw","Sqft_raw","Deposit_raw"]
clean = clean.drop(columns=cols_to_drop, errors="ignore")

# numeric conversions: values that cannot be parsed become 0 for rent/sqft
for int_col in ("rent", "sqft"):
    clean[int_col] = pd.to_numeric(clean.get(int_col), errors="coerce").fillna(0).astype(int)
clean["price_per_sqft"] = pd.to_numeric(clean.get("price_per_sqft"), errors="coerce")

# Back-fill price_per_sqft as rent / sqft wherever it is missing or zero
# and sqft is positive (so the division is defined).
mask_pps = ~(clean["price_per_sqft"].notna() & clean["price_per_sqft"].ne(0))
cond = mask_pps & clean["sqft"].gt(0)
clean.loc[cond, "price_per_sqft"] = clean.loc[cond, "rent"].div(clean.loc[cond, "sqft"])

# Deposits that could not be parsed are stored as 0.
clean["deposit"] = pd.to_numeric(clean["deposit"], errors="coerce").fillna(0).astype(int)

# normalize categories: each column is passed through its own normaliser
for cat_col, normalizer in (
    ("property_type", normalize_property_type),
    ("parking", norm_parking),
    ("furnishing", norm_furnishing),
):
    clean[cat_col] = clean[cat_col].apply(normalizer)

# Trim the locality, then keep only the part before the first comma.
clean["locality"] = (
    clean["locality"]
    .astype(str)
    .str.strip()
    .str.split(",")
    .str[0]
)

clean["location_full"] = clean["location_full"].astype(str).str.strip()

# Day-first parsing; values that cannot be parsed become NaT.
clean["available_from"] = pd.to_datetime(
    clean["available_from"], dayfirst=True, errors="coerce"
)

# reorder columns: preferred columns first (those that exist), then every
# remaining column in its current order
desired = [
    "locality","location_full","property_type","sqft","rent","deposit",
    "price_per_sqft","parking","furnishing","available_from"
]
final_cols = [name for name in desired if name in clean.columns]
extra_cols = [name for name in clean.columns if name not in final_cols]
clean = clean.loc[:, final_cols + extra_cols]

# Row filter with a 400-row floor:
#   1. prefer rows where both sqft and rent are positive;
#   2. if that leaves fewer than 400 rows, accept rows where either is positive;
#   3. if that still leaves fewer than 400 rows, keep every row.
strict_mask = clean["sqft"].gt(0) & clean["rent"].gt(0)
clean_strict = clean[strict_mask]

if len(clean_strict) < 400:
    relaxed_mask = clean["sqft"].gt(0) | clean["rent"].gt(0)
    clean_relaxed = clean[relaxed_mask]
    if len(clean_relaxed) >= 400:
        clean = clean_relaxed
else:
    clean = clean_strict

clean = clean.reset_index(drop=True)


# ---------------------------------------------------------
# SAVE OUTPUT (YOUR REQUIRED WAY)
# ---------------------------------------------------------
# Write the cleaned frame without the index column; later cells reload this file.
clean.to_csv("pune_commercial_listings_cleaned.csv", index=False)

print("Cleaning complete.")
print("Final shape:", clean.shape)


### Checking whether any null values are present in the dataset

In [3]:
import pandas as pd
# Reload the cleaned file from disk. Note: this rebinds `df`, which held the
# raw listings in the first cell.
df = pd.read_csv("pune_commercial_listings_cleaned.csv")

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# True when at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

In [None]:
# Rows that contain at least one missing value.
df.loc[df.isna().any(axis="columns")]

--------------------------
-------------------------

### Checking for duplicate values

In [17]:
# Count rows that exactly repeat an earlier row.
# NOTE(review): the execution counters show this cell (In [17]) was last run
# after the drop_duplicates cell below (In [16]), so the displayed count is
# the post-deduplication value.
df.duplicated().sum()

np.int64(0)

### Removing the duplicate values

In [16]:
# Keep the first occurrence of each fully identical row; the original index
# labels are preserved (no reset).
df = df.loc[~df.duplicated()]

In [18]:
# Column dtypes and non-null counts after de-duplication.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 569 entries, 0 to 591
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   locality        569 non-null    object 
 1   location_full   569 non-null    object 
 2   property_type   569 non-null    object 
 3   sqft            569 non-null    int64  
 4   rent            569 non-null    int64  
 5   deposit         569 non-null    int64  
 6   price_per_sqft  569 non-null    float64
 7   parking         569 non-null    object 
 8   furnishing      569 non-null    object 
 9   available_from  569 non-null    object 
dtypes: float64(1), int64(3), object(6)
memory usage: 48.9+ KB


--------------------
-------------------

### Addressing the mistyped entries in the `deposit` column

In [19]:
# Deposits below 100 are treated as data-entry errors.
# NOTE(review): the first cell stores "no deposit" markers and unparseable
# deposits as 0, so those rows are overwritten here as well.
mask_wrong_deposit = df['deposit'].lt(100)

# Replacement rule: deposit = 3 x rent for the flagged rows only.
df.loc[mask_wrong_deposit, 'deposit'] = df.loc[mask_wrong_deposit, 'rent'].mul(3)

# Overwrite the cleaned CSV with the corrected frame.
df.to_csv("pune_commercial_listings_cleaned.csv", index=False)

print("Deposit correction applied successfully! Updated file saved.")

Deposit correction applied successfully! Updated file saved.


--------------
-------------

### Cleaning the sqft column

In [20]:
# Rows with an area under 100 sqft are treated as invalid.
invalid_sqft = df.loc[df['sqft'].lt(100)]
print("Invalid sqft rows found:", len(invalid_sqft))

# Keep only rows with at least 100 sqft.
df = df.loc[df['sqft'].ge(100)]

# Overwrite the cleaned CSV with the filtered frame.
df.to_csv("pune_commercial_listings_cleaned.csv", index=False)

print("All invalid sqft entries removed successfully!")

Invalid sqft rows found: 14
All invalid sqft entries removed successfully!


-----------
----------

### Cleaning Deposit column 

In [6]:
# Reload the cleaned file as last saved to disk.
df = pd.read_csv("pune_commercial_listings_cleaned.csv")

In [8]:
# Number of rows whose deposit is below 1000.
df['deposit'].lt(1000).sum()

np.int64(0)

In [9]:
# Summary statistics for the deposit column.
df['deposit'].describe()

count    5.550000e+02
mean     1.666761e+05
std      3.448732e+05
min      5.625000e+03
25%      5.000000e+04
50%      9.000000e+04
75%      1.800000e+05
max      5.463000e+06
Name: deposit, dtype: float64

In [7]:
# Deposits below 1000 are treated as unrealistic.
mask_bad_deposit = df['deposit'].lt(1000)

# Replacement rule: deposit = 3 x rent for the flagged rows only.
df.loc[mask_bad_deposit, 'deposit'] = df.loc[mask_bad_deposit, 'rent'].mul(3)

# Overwrite the cleaned CSV with the corrected frame.
df.to_csv("pune_commercial_listings_cleaned.csv", index=False)

print("Deposit column cleaned successfully!")
print("Number of corrected deposit values:", mask_bad_deposit.sum())

Deposit column cleaned successfully!
Number of corrected deposit values: 0


-------------
-------------

### Calculating `price_per_sqft` (rent per square foot)

In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("pune_commercial_listings_cleaned.csv")

# Keep only rows with a positive area so the division below is defined.
# The explicit .copy() makes the filtered frame independent of the loaded one,
# so the column assignment that follows cannot raise SettingWithCopyWarning
# when the filter actually removes rows.
df = df.loc[df["sqft"] > 0].copy()

# Recompute rent per sqft for every row, rounded to 2 decimals. This replaces
# whatever price_per_sqft values were in the file.
df["price_per_sqft"] = (df["rent"] / df["sqft"]).round(2)

# Save back to the same file
df.to_csv("pune_commercial_listings_cleaned.csv", index=False)

print("price_per_sqft column calculated safely and saved successfully!")

price_per_sqft column calculated safely and saved successfully!


--------
-------

In [2]:
# Row count per property type (includes the "Unknown" placeholder).
df['property_type'].value_counts()

property_type
Office        325
Shop          170
Warehouse      23
Commercial     18
Unknown        11
Showroom        5
Restaurant      3
Name: count, dtype: int64

---------------
-----------------

### Addressing the `Unknown` values in the `property_type` column

In [4]:
import pandas as pd

# Load dataset
df = pd.read_csv("pune_commercial_listings_cleaned.csv")

# Index labels of the rows whose property type is the "Unknown" placeholder.
unknown_idx = df.index[df["property_type"].eq("Unknown")].tolist()

# The quotas below were chosen for exactly 11 Unknown rows; warn otherwise.
if len(unknown_idx) != 11:
    print(f"Warning: Expected 11 Unknown rows, found {len(unknown_idx)}")

# Fixed quota per label (6 + 3 + 1 + 1 = 11). Rows are handed out in index
# order, so which row receives which label depends only on row position.
quotas = {"Office": 6, "Shop": 3, "Commercial": 1, "Warehouse": 1}

# Cut unknown_idx into consecutive slices, one per label.
assign_map = {}
start = 0
for prop_type, quota in quotas.items():
    assign_map[prop_type] = unknown_idx[start:start + quota]
    start += quota

# Apply assignments
for prop_type, idx_list in assign_map.items():
    df.loc[idx_list, "property_type"] = prop_type

# Save cleaned file
df.to_csv("pune_commercial_listings_cleaned.csv", index=False)

print("Unknown property types replaced proportionally successfully!")
print("\nUpdated Distribution:")
print(df["property_type"].value_counts())

Unknown property types replaced proportionally successfully!

Updated Distribution:
property_type
Office        331
Shop          173
Warehouse      24
Commercial     19
Showroom        5
Restaurant      3
Name: count, dtype: int64


------
-------

### Dropping the `parking` column

In [6]:
import pandas as pd

# Load dataset
df = pd.read_csv("pune_commercial_listings_cleaned.csv")

# Remove the parking column; errors="ignore" makes this a no-op when the
# column is already gone.
df = df.drop(columns=["parking"], errors="ignore")

# Save updated file
df.to_csv("pune_commercial_listings_cleaned.csv", index=False)

print("Column 'parking' dropped successfully!")

Column 'parking' dropped successfully!


------
-----

### Dropping the `available_from` column

In [7]:
import pandas as pd

# Load dataset
df = pd.read_csv("pune_commercial_listings_cleaned.csv")

# Remove the available_from column; errors="ignore" makes this a no-op when
# the column is already gone.
df = df.drop(columns=["available_from"], errors="ignore")

# Save cleaned dataset
df.to_csv("pune_commercial_listings_cleaned.csv", index=False)

print("Column 'available_from' dropped successfully!")

Column 'available_from' dropped successfully!


-----------
-------

### Detecting outliers (IQR method)

In [8]:
import pandas as pd

# Load dataset
df = pd.read_csv("pune_commercial_listings_cleaned.csv")

# Numeric columns to check
num_cols = ["rent", "sqft", "price_per_sqft", "deposit"]

# Labels for the rounded statistics, in the order they are reported.
stat_labels = ("Q1", "Q3", "IQR", "Lower Bound", "Upper Bound")

# column name -> summary dict
iqr_results = {}

for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    # Fences at 1.5 x IQR beyond the quartiles.
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Rows strictly outside the fences count as outliers.
    is_outlier = df[col].lt(lower_bound) | df[col].gt(upper_bound)
    outliers = df.loc[is_outlier]

    summary = {
        label: round(value, 2)
        for label, value in zip(stat_labels, (Q1, Q3, IQR, lower_bound, upper_bound))
    }
    summary["Outlier Count"] = len(outliers)
    iqr_results[col] = summary

# Display summary
for col, stats in iqr_results.items():
    print(f"\n==== Outlier Summary for {col} ====")
    for key, value in stats.items():
        print(f"{key}: {value}")



==== Outlier Summary for rent ====
Q1: 18000.0
Q3: 60000.0
IQR: 42000.0
Lower Bound: -45000.0
Upper Bound: 123000.0
Outlier Count: 41

==== Outlier Summary for sqft ====
Q1: 220.0
Q3: 550.0
IQR: 330.0
Lower Bound: -275.0
Upper Bound: 1045.0
Outlier Count: 0

==== Outlier Summary for price_per_sqft ====
Q1: 55.7
Q3: 133.33
IQR: 77.64
Lower Bound: -60.76
Upper Bound: 249.78
Outlier Count: 73

==== Outlier Summary for deposit ====
Q1: 50000.0
Q3: 180000.0
IQR: 130000.0
Lower Bound: -145000.0
Upper Bound: 375000.0
Outlier Count: 40
