In [9]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("population_data.csv")


def _to_number(column):
    """Parse a column of human-formatted numbers into a numeric Series.

    Thousands separators (','), percent signs and explicit plus signs are
    removed, and the Unicode minus sign (U+2212) is mapped to an ASCII '-'.
    Values that still cannot be parsed become NaN (errors='coerce').
    Columns that are already numeric pass through unchanged in value.
    """
    text = column.astype(str)
    for old, new in ((",", ""), ("%", ""), ("+", ""), ("−", "-")):
        text = text.str.replace(old, new, regex=False)
    return pd.to_numeric(text.str.strip(), errors="coerce")


# Step 1: Remove footnotes from the 'Country or territory' column.
# The pattern matches one bracket pair at a time; a greedy r"\[.*\]" would
# delete everything between the first '[' and the last ']' in a value.
df['Country or territory'] = (
    df['Country or territory']
    .str.replace(r"\[[^\]]*\]", "", regex=True)
    .str.strip()  # drop whitespace left behind so duplicates compare equal
)

# Step 2: Handle missing values
# Replace cells that are exactly '–' (en dash placeholder) with NA
df = df.replace("–", pd.NA)

# Step 3: Convert data types
# Strip formatting characters *before* coercing: with errors='coerce' a value
# such as "1,234,567" would otherwise silently turn into NaN.
for col in ('Population (1 July 2022)', 'Population (1 July 2023)', 'Change (%)'):
    df[col] = _to_number(df[col])

# Step 4: Standardize column names
# NOTE: assignment is positional — it assumes the CSV has exactly these six
# columns in this order; pandas raises ValueError on a length mismatch.
df.columns = [
    'Country',
    'Population_2022',
    'Population_2023',
    'Change_Percent',
    'UN_Continental_Region',
    'UN_Statistical_Subregion'
]

# Step 5: Remove duplicates
df = df.drop_duplicates()

# Step 6: Save the cleaned data to a new CSV file
df.to_csv("cleaned_population_data.csv", index=False)

print("✅ Data cleaning and preprocessing complete. Cleaned data saved to 'cleaned_population_data.csv'.")


✅ Data cleaning and preprocessing complete. Cleaned data saved to 'cleaned_population_data.csv'.


In [1]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("karnataka_districts_2011.csv")

# Step 1: Remove footnote markers such as "[1]", "[12]" or "[a]" from the
# column names. This runs first so the name-based steps below see clean names,
# and whitespace is stripped *after* removal so "Name [1]" becomes "Name".
df.columns = (
    df.columns
    .str.replace(r"\[(?:\d+|[a-z])\]", "", regex=True)
    .str.strip()
)

# Step 2: Remove the sub-district column (if it exists). Both spellings are
# accepted; errors='ignore' keeps this a no-op when neither is present.
df = df.drop(columns=['Sub-District', 'Sub-Districts'], errors='ignore')

# Step 3: Replace common placeholders like '–' with NaN
df = df.replace("–", pd.NA)

# Step 4: Clean numeric columns (adjust column names to the actual file)
# Thousands separators are removed before conversion: with errors='coerce'
# a value such as "1,234,567" would otherwise silently become NaN.
numeric_columns = ['Population', 'Area', 'Density']
for col in numeric_columns:
    if col in df.columns:
        text = df[col].astype(str).str.replace(',', '', regex=False).str.strip()
        df[col] = pd.to_numeric(text, errors='coerce')

# Step 5: Remove duplicates
df = df.drop_duplicates()

# Step 6: Save the cleaned data
df.to_csv("cleaned_karnataka_districts_2011.csv", index=False)

print("✅ Cleaned data saved to 'cleaned_karnataka_districts_2011.csv'")


✅ Cleaned data saved to 'cleaned_karnataka_districts_2011.csv'


In [11]:
import pandas as pd

# Read the raw district table
file_path = "karnataka_districts_2011.csv"
df = pd.read_csv(file_path)

# Build the cleaned frame from the raw one, leaving out "Sub-Districts"
df_cleaned = df.drop(columns=["Sub-Districts"])

# Percentage columns are text: strip the '%' and parse as float
for pct_col in ("Increase (%)", "Literacy (%)"):
    pct_text = df_cleaned[pct_col].str.replace('%', '')
    df_cleaned[pct_col] = pct_text.astype(float)

# Population is text containing ','; remove it before the integer cast
population_text = df_cleaned["Population"].str.replace(',', '')
df_cleaned["Population"] = population_text.astype(int)

# The remaining numeric columns are cast to int directly
for int_col in ("Sex Ratio", "Density"):
    df_cleaned[int_col] = df_cleaned[int_col].astype(int)

# Write the result to a new CSV file (row index omitted)
cleaned_file_path = "karnataka_districts_2011_cleaned.csv"
df_cleaned.to_csv(cleaned_file_path, index=False)

print(f"Cleaned file saved as: {cleaned_file_path}")


Cleaned file saved as: karnataka_districts_2011_cleaned.csv


In [13]:
import pandas as pd

# Read the source table
df = pd.read_csv("india_population_2011_corrected.csv")

# 'Literacy Rate': drop the '%' sign, then parse as float
literacy_text = df["Literacy Rate"].str.replace('%', '')
df["Literacy Rate"] = literacy_text.astype(float)

# 'Decade Growth (%)': map the non-standard minus sign to '-', drop the '%',
# then parse as float
growth_text = df["Decade Growth (%)"].str.replace('−', '-')
growth_text = growth_text.str.replace('%', '')
df["Decade Growth (%)"] = growth_text.astype(float)

# Cast the remaining numeric columns to int in a single pass
numeric_columns = [
    "Population", "Male", "Female", "Rural Population",
    "Urban Population", "Area (km²)", "Population Density"
]
df = df.astype({name: int for name in numeric_columns})

# Write the cleaned table (row index omitted) and report the output name
df.to_csv("india_population_2011_cleaned.csv", index=False)
print("Cleaned file saved as: india_population_2011_cleaned.csv")


Cleaned file saved as: india_population_2011_cleaned.csv
