In [5]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv("CrimesOnWomenData_Updated_Till_2024.csv")

# Show the top 5 rows to verify
df.head()

Unnamed: 0,Numbers,State,Year,No. of Rape cases,Kidnap And Assault,Dowry Deaths,Assault against women,Assault against modesty of women,Domestic violence,Women Trafficking,Notes
0,0,ANDHRA PRADESH,2001,871,765,420,3544,2271,5791,7,
1,1,ARUNACHAL PRADESH,2001,33,55,0,78,3,11,0,
2,2,ASSAM,2001,817,1070,59,850,4,1248,0,
3,3,BIHAR,2001,888,518,859,562,21,1558,83,
4,4,CHHATTISGARH,2001,959,171,70,1763,161,840,0,


In [6]:
# Clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_").str.replace(".", "", regex=False)

# Drop unnecessary columns
df.drop(columns=['numbers', 'notes'], inplace=True)

# Confirm changes
df.head()

Unnamed: 0,state,year,no_of_rape_cases,kidnap_and_assault,dowry_deaths,assault_against_women,assault_against_modesty_of_women,domestic_violence,women_trafficking
0,ANDHRA PRADESH,2001,871,765,420,3544,2271,5791,7
1,ARUNACHAL PRADESH,2001,33,55,0,78,3,11,0
2,ASSAM,2001,817,1070,59,850,4,1248,0
3,BIHAR,2001,888,518,859,562,21,1558,83
4,CHHATTISGARH,2001,959,171,70,1763,161,840,0


In [7]:
# Check null values
df.isnull().sum()

# Fill nulls with 0
df.fillna(0, inplace=True)

In [8]:
# Ensure 'year' is integer
df['year'] = df['year'].astype(int)

# Convert crime counts to numeric
crime_cols = df.columns.drop(['state', 'year'])
df[crime_cols] = df[crime_cols].apply(pd.to_numeric, errors='coerce')

In [9]:
def assign_period(year):
    if year <= 2010:
        return '2001–2010: Pre-Nirbhaya'
    elif year <= 2015:
        return '2011–2015: Post-Nirbhaya'
    elif year <= 2019:
        return '2016–2019: Digital & Policing'
    elif year <= 2022:
        return '2020–2022: COVID Era'
    else:
        return '2023–2024: Provisional'

df['period'] = df['year'].apply(assign_period)

# Preview
df.head()


Unnamed: 0,state,year,no_of_rape_cases,kidnap_and_assault,dowry_deaths,assault_against_women,assault_against_modesty_of_women,domestic_violence,women_trafficking,period
0,ANDHRA PRADESH,2001,871.0,765.0,420,3544.0,2271,5791,7,2001–2010: Pre-Nirbhaya
1,ARUNACHAL PRADESH,2001,33.0,55.0,0,78.0,3,11,0,2001–2010: Pre-Nirbhaya
2,ASSAM,2001,817.0,1070.0,59,850.0,4,1248,0,2001–2010: Pre-Nirbhaya
3,BIHAR,2001,888.0,518.0,859,562.0,21,1558,83,2001–2010: Pre-Nirbhaya
4,CHHATTISGARH,2001,959.0,171.0,70,1763.0,161,840,0,2001–2010: Pre-Nirbhaya


In [11]:
df.columns.tolist()

['state',
 'year',
 'no_of_rape_cases',
 'kidnap_and_assault',
 'dowry_deaths',
 'assault_against_women',
 'assault_against_modesty_of_women',
 'domestic_violence',
 'women_trafficking',
 'period']

In [13]:
crime_columns = [
    'no_of_rape_cases',
    'kidnap_and_assault',
    'dowry_deaths',
    'assault_against_women',
    'assault_against_modesty_of_women',
    'domestic_violence',
    'women_trafficking'
]

df['total_crimes'] = df[crime_columns].sum(axis=1)

In [14]:
df[['state', 'year', 'total_crimes']].head()

Unnamed: 0,state,year,total_crimes
0,ANDHRA PRADESH,2001,13669.0
1,ARUNACHAL PRADESH,2001,180.0
2,ASSAM,2001,4048.0
3,BIHAR,2001,4489.0
4,CHHATTISGARH,2001,3964.0


In [15]:
df['is_provisional'] = df['year'].apply(lambda x: 1 if x >= 2023 else 0)

In [16]:
df.to_csv("Cleaned_Crime_Data_2001_2024.csv", index=False)

In [17]:
import pandas as pd

df = pd.read_csv("CrimesOnWomenData_Updated_Till_2024.csv")
df.head()  # Check first few rows

Unnamed: 0,Numbers,State,Year,No. of Rape cases,Kidnap And Assault,Dowry Deaths,Assault against women,Assault against modesty of women,Domestic violence,Women Trafficking,Notes
0,0,ANDHRA PRADESH,2001,871,765,420,3544,2271,5791,7,
1,1,ARUNACHAL PRADESH,2001,33,55,0,78,3,11,0,
2,2,ASSAM,2001,817,1070,59,850,4,1248,0,
3,3,BIHAR,2001,888,518,859,562,21,1558,83,
4,4,CHHATTISGARH,2001,959,171,70,1763,161,840,0,


In [18]:
df.isnull().sum()

Numbers                               0
State                                 0
Year                                  0
No. of Rape cases                     0
Kidnap And Assault                    0
Dowry Deaths                          0
Assault against women                 0
Assault against modesty of women      0
Domestic violence                     0
Women Trafficking                     0
Notes                               759
dtype: int64

In [19]:
df['Notes'] = df['Notes'].fillna('NA')

In [20]:
df.duplicated().sum()

0

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 763 entries, 0 to 762
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   Numbers                           763 non-null    int64 
 1   State                             763 non-null    object
 2   Year                              763 non-null    int64 
 3   No. of Rape cases                 763 non-null    object
 4   Kidnap And Assault                763 non-null    object
 5   Dowry Deaths                      763 non-null    int64 
 6   Assault against women             763 non-null    object
 7   Assault against modesty of women  763 non-null    int64 
 8   Domestic violence                 763 non-null    int64 
 9   Women Trafficking                 763 non-null    int64 
 10  Notes                             763 non-null    object
dtypes: int64(6), object(5)
memory usage: 65.7+ KB


In [23]:
import re

# Use regex to extract only digits
for col in cols_to_convert:
    df[col] = df[col].astype(str).str.extract(r'(\d+)')  # Extract numeric part
    df[col] = df[col].astype(float).fillna(0).astype(int)  # Convert to int safely

In [24]:
df[cols_to_convert].dtypes

No. of Rape cases        int32
Kidnap And Assault       int32
Assault against women    int32
dtype: object

In [25]:
df['State'].unique()

array(['ANDHRA PRADESH', 'ARUNACHAL PRADESH', 'ASSAM', 'BIHAR',
       'CHHATTISGARH', 'GOA', 'GUJARAT', 'HARYANA', 'HIMACHAL PRADESH',
       'JAMMU & KASHMIR', 'JHARKHAND', 'KARNATAKA', 'KERALA',
       'MADHYA PRADESH', 'MAHARASHTRA', 'MANIPUR', 'MEGHALAYA', 'MIZORAM',
       'NAGALAND', 'ODISHA', 'PUNJAB', 'RAJASTHAN', 'SIKKIM',
       'TAMIL NADU', 'TRIPURA', 'UTTAR PRADESH', 'UTTARAKHAND',
       'WEST BENGAL', 'A & N ISLANDS', 'CHANDIGARH', 'D & N HAVELI',
       'DAMAN & DIU', 'LAKSHADWEEP', 'PUDUCHERRY', 'Andhra Pradesh',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chhattisgarh', 'Goa',
       'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu & Kashmir',
       'Jharkhand', 'Karnataka', 'Kerala', 'Madhya Pradesh',
       'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland',
       'Odisha', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu',
       'Telangana', 'Tripura', 'Uttar Pradesh', 'Uttarakhand',
       'West Bengal', 'A & N Islands', 'Chandigarh', 'D&N Haveli',
  

In [26]:
# Strip spaces and convert to Title Case (e.g., "Andhra Pradesh")
df['State'] = df['State'].str.strip().str.title()

In [27]:
df['State'].unique()

array(['Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
       'Chhattisgarh', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh',
       'Jammu & Kashmir', 'Jharkhand', 'Karnataka', 'Kerala',
       'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram',
       'Nagaland', 'Odisha', 'Punjab', 'Rajasthan', 'Sikkim',
       'Tamil Nadu', 'Tripura', 'Uttar Pradesh', 'Uttarakhand',
       'West Bengal', 'A & N Islands', 'Chandigarh', 'D & N Haveli',
       'Daman & Diu', 'Lakshadweep', 'Puducherry', 'Telangana',
       'D&N Haveli', 'Delhi Ut', 'Delhi'], dtype=object)

In [28]:
# Create a mapping dictionary
state_name_mapping = {
    'A & N Islands': 'Andaman & Nicobar Islands',
    'D & N Haveli': 'Dadra & Nagar Haveli and Daman & Diu',
    'D&N Haveli': 'Dadra & Nagar Haveli and Daman & Diu',
    'Daman & Diu': 'Dadra & Nagar Haveli and Daman & Diu',
    'Delhi Ut': 'Delhi',
    'Delhi': 'Delhi',  # Keep as 'Delhi'
}

# Replace using the mapping
df['State'] = df['State'].replace(state_name_mapping)


In [29]:
df['State'].unique()

array(['Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
       'Chhattisgarh', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh',
       'Jammu & Kashmir', 'Jharkhand', 'Karnataka', 'Kerala',
       'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram',
       'Nagaland', 'Odisha', 'Punjab', 'Rajasthan', 'Sikkim',
       'Tamil Nadu', 'Tripura', 'Uttar Pradesh', 'Uttarakhand',
       'West Bengal', 'Andaman & Nicobar Islands', 'Chandigarh',
       'Dadra & Nagar Haveli and Daman & Diu', 'Lakshadweep',
       'Puducherry', 'Telangana', 'Delhi'], dtype=object)

In [30]:
df['Year'].min(), df['Year'].max()

(2001, 2024)

In [31]:
df.describe()

Unnamed: 0,Numbers,Year,No. of Rape cases,Kidnap And Assault,Dowry Deaths,Assault against women,Assault against modesty of women,Domestic violence,Women Trafficking
count,763.0,763.0,763.0,763.0,763.0,763.0,763.0,763.0,763.0
mean,381.0,2011.542595,1163.338139,1096.336828,208.060288,1523.237221,320.948886,2503.247706,27.727392
std,220.403418,6.291202,3912.564602,1968.748951,419.231993,2437.46152,794.002121,3998.635159,78.749072
min,0.0,2001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,190.5,2006.0,36.0,14.0,0.0,25.5,2.0,9.0,0.0
50%,381.0,2012.0,361.0,249.0,22.0,342.0,28.0,588.0,0.0
75%,571.5,2017.0,1097.0,1158.5,254.0,1944.5,255.5,3426.0,12.0
max,762.0,2024.0,59691.0,15381.0,2524.0,14853.0,9422.0,23278.0,549.0


In [32]:
df.to_csv("Cleaned_CrimesOnWomen_2001_2024.csv", index=False)