In [1]:
import pandas as pd
from scipy.stats import skew, kurtosis, mode
from google.colab import files

In [3]:
# Read the cleaned, merged rent dataset into the notebook-wide frame `df`.
# The location is kept in one named value so it is easy to spot and change.
rent_data_csv = "/content/data/rent_prediction.csv"
df = pd.read_csv(rent_data_csv)

In [4]:
# Preview the loaded frame. The rendered table is elided by pandas
# (note the "..." row and column in the output), so this is a sample, not the full data.
df

Unnamed: 0,RegionName,StateName,Date,TopTier,MiddleTier,BottomTier,SingleFamily,Condo,1BHK,2BHK,...,MultifamilyRent,SingleFamilyRent,SeasonalAllHomesRent,SeasonalMultifamilyRent,SeasonalSingleFamilyRent,ZORDI,PctSoldBelowList,NewConSalePrice,NewConSalePriceSqFt,MedianListPrice
0,United States,,2020-06-30,5.102761e+05,260439.227330,133858.894988,256686.462157,266377.764120,187298.758305,190563.902908,...,1439.605030,1634.229137,1514.534958,1437.076721,1633.171753,100.0,0.539669,328990.0,152.461796,312900.0
1,"Los Angeles, CA",CA,2020-06-30,1.162437e+06,692709.155101,482112.916477,722666.052404,523499.776213,445203.481299,550611.075757,...,2174.202572,3229.571802,2320.978200,2183.842961,3232.197550,134.0,0.501218,1098253.0,607.024834,819000.0
2,"Chicago, IL",IL,2020-06-30,4.229209e+05,250456.136880,149956.832438,257755.873117,206966.438442,160923.952997,180026.944146,...,1672.908263,1824.532020,1677.557975,1658.497632,1818.115553,97.0,0.697391,482860.0,181.670786,305000.0
3,"Dallas, TX",TX,2020-06-30,4.371268e+05,271809.179451,181797.447883,270853.514036,183579.390198,133926.022959,170464.587332,...,1286.185507,1751.036748,1383.192096,1283.038454,1750.281954,108.0,0.520436,341732.0,142.478673,331900.0
4,"Houston, TX",TX,2020-06-30,3.954540e+05,234440.009994,156726.798926,234904.625421,128670.425313,111340.432485,143931.616756,...,1243.348378,1708.691620,1375.329884,1247.742053,1703.780426,46.0,0.634123,303720.0,130.118343,306997.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5949,"Burlington, NC",NC,2025-03-31,4.261431e+05,288087.556339,188068.687464,294439.326895,229255.599967,135550.291098,182464.372871,...,1293.038964,1740.784086,1532.685122,1289.201269,1742.359451,71.0,0.542857,387500.0,172.181351,325000.0
5950,"Coeur d'Alene, ID",ID,2025-03-31,9.745493e+05,579544.543582,439605.325510,592639.216180,444773.787424,452692.237506,465385.582532,...,1606.621672,2373.684270,1903.084635,1626.643858,2391.402830,48.0,0.563636,637450.0,285.040945,675000.0
5951,"Homosassa Springs, FL",FL,2025-03-31,4.419423e+05,278521.607429,196035.541661,286283.642360,181053.046071,144811.417838,207104.462386,...,1273.570758,1770.604870,1727.971464,1251.004036,1771.582570,33.0,0.771689,265000.0,178.448868,315450.0
5952,"New Bern, NC",NC,2025-03-31,4.019812e+05,251865.956852,135448.106946,254863.558846,237662.941327,174217.015791,164824.003477,...,1389.086809,1636.428276,1517.113848,1382.782403,1658.067864,51.0,0.602041,344275.0,157.252286,325000.0


In [5]:
# Keep only the numeric columns for the descriptive statistics computed below.
# 'number' matches every numeric dtype (all int/float widths), so columns are
# not silently dropped if the data is ever loaded or downcast to something
# other than float64/int64. For a default read_csv load the selection is the
# same as the previous ['float64', 'int64'] filter.
numeric_df = df.select_dtypes(include='number')

In [6]:
# Per-column descriptive statistics of the numeric features:
# central tendency (mean, median, mode), spread (std) and shape (skew, kurtosis).
mean_values = numeric_df.mean()
median_values = numeric_df.median()
std_values = numeric_df.std()
skew_values = numeric_df.skew()
kurtosis_values = numeric_df.kurtosis()

# DataFrame.mode() returns one row per tied mode, sorted ascending, so the
# first row holds the smallest mode of each column.
# NOTE(review): for a continuous column with no repeated values every value is
# a mode, which makes this entry the column minimum - interpret with care.
mode_values = numeric_df.mode().iloc[0]

# Assemble one table: rows are the numeric columns, columns are the statistics.
stats_by_label = {
    'Mean': mean_values,
    'Median': median_values,
    'Mode': mode_values,
    'Std Dev': std_values,
    'Skewness': skew_values,
    'Kurtosis': kurtosis_values,
}
stats = pd.DataFrame(stats_by_label)

In [7]:
# Render the summary-statistics table (one row per numeric column).
display(stats)

Unnamed: 0,Mean,Median,Mode,Std Dev,Skewness,Kurtosis
TopTier,601358.17807,525034.516595,222893.780279,298644.145003,3.068111,14.66608
MiddleTier,368959.495655,327418.924148,139242.013361,169344.048784,2.842699,13.494555
BottomTier,243821.479722,213675.977142,75066.146211,123037.479757,2.164933,8.167391
SingleFamily,376630.203342,329610.258295,139769.79237,186129.520771,3.119413,15.947775
Condo,280996.453188,253328.39733,77001.463155,127924.814054,1.587996,3.828158
1BHK,212569.857998,182390.557248,60159.842987,108943.313525,1.49045,2.776027
2BHK,262835.921923,233332.738936,74223.096084,137049.133232,1.967227,6.10392
3BHK,349764.778888,307465.026089,140606.705953,168567.981673,2.982288,14.522111
4BHK,481018.317243,428584.833907,204352.614448,210230.486667,3.113652,14.61703
5+BHK,659575.84745,572147.22186,190624.382951,321730.740592,2.899397,11.302366


In [8]:
# Count the missing entries in every column and express them as a share of all rows.
missing_counts = df.isna().sum()
missing_percent = missing_counts.div(len(df)).mul(100)

# Build the summary table and keep only the columns that actually have gaps.
missing_data_summary = pd.DataFrame(
    {
        'Missing Count': missing_counts,
        'Missing Percentage': missing_percent.round(2),
    }
).loc[lambda summary: summary['Missing Count'] > 0]

In [9]:
# Show the columns that contain missing values, with their counts and percentages.
missing_data_summary

Unnamed: 0,Missing Count,Missing Percentage
StateName,58,0.97


In [10]:
# Inspect the rows whose StateName is missing, showing only the two identifying columns.
# In the rendered output these rows all carry RegionName "United States", i.e. they are
# national-level aggregates rather than individual regions.
df.loc[df['StateName'].isnull(), ['RegionName', 'StateName']]

Unnamed: 0,RegionName,StateName
0,United States,
83,United States,
166,United States,
250,United States,
333,United States,
419,United States,
508,United States,
599,United States,
688,United States,
777,United States,


In [11]:
# Rows without a StateName are kept as a separate national-level subset for trend analysis.
# StateName is empty for every one of these rows, so the column is dropped from the subset.
national_level_df = df[df['StateName'].isna()].drop(columns='StateName')

# Write the subset to its own CSV (without the index) and hand it to the browser
# via the Colab download helper.
national_csv_path = "/content/data/national_level_data.csv"
national_level_df.to_csv(national_csv_path, index=False)
files.download(national_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
# The state-level subset is every row that does have a StateName.
has_state_name = df['StateName'].notna()
state_level_df = df.loc[has_state_name]

# Write the subset to its own CSV (without the index) and hand it to the browser
# via the Colab download helper.
state_csv_path = "/content/data/state_level_data.csv"
state_level_df.to_csv(state_csv_path, index=False)
files.download(state_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>