In [None]:
# Import necessary libraries
import pandas as pd

# Open the Chicago crime CSV as a chunked reader: with `chunksize` set,
# read_csv yields the file piece by piece instead of loading it all at once.
# NOTE(review): this is an absolute, machine-specific local path (despite the
# `_url` name) — it will not resolve on another machine.
chicago_crime_url = r"C:\Users\Gideon\Desktop\From old\Datasets\crime_data_chicago.csv"
chunk_size = 1000
chunk_iterator = pd.read_csv(
    chicago_crime_url,
    chunksize=chunk_size,
)

In [None]:
# Write every chunk from the reader to its own CSV file (chunk_0.csv, chunk_1.csv, ...).
# NOTE: chunk_iterator is a single-pass reader — once this loop has consumed it,
# re-running the cell exports nothing until the reader is created again.
for chunk_no, chunk_df in enumerate(chunk_iterator):
    output_file = f'chunk_{chunk_no}.csv'
    # index=False keeps the row index out of the exported file
    chunk_df.to_csv(output_file, index=False)
    print(f"Chunk {chunk_no} exported to {output_file}")

In [None]:
import pandas as pd
# Load the working dataset into `crime_df`; every later cell operates on this frame.
# NOTE(review): the export loop earlier in this notebook writes files named
# chunk_<i>.csv, not 'chicago1.csv' — confirm this file exists next to the notebook.
crime_df = pd.read_csv('chicago1.csv') # load your data here

In [None]:
crime_df.head()

In [None]:
# Switch to a 1-based row index for display.
# An explicit RangeIndex is assigned instead of `crime_df.index += 1` so the cell
# is idempotent: incrementing in place shifts the index again on every re-run,
# whereas this always yields 1..len(crime_df).
crime_df.index = pd.RangeIndex(start=1, stop=len(crime_df) + 1)

In [None]:
crime_df.head()

In [None]:
# the dimensions of the dataset
crime_df.shape

In [None]:
crime_df.columns

In [None]:
crime_df.info()

In [None]:
# checking for missing values
crime_df.isna().sum()

In [None]:
def missing_percentage(data):
    """
    Computes the percentage of missing values in a DataFrame or Series.

    Parameters:
    - data: Pandas DataFrame or Series to inspect

    Returns:
    - For a DataFrame, a Pandas Series indexed by column name holding the
      percentage (0-100) of missing entries in each column; for a Series,
      a single number.
    """
    # len() is the row count for both DataFrame and Series, whereas unpacking
    # `data.shape` into two names fails for a one-dimensional Series.
    n_rows = len(data)
    return data.isna().sum() / n_rows * 100

In [None]:
def missing_value_percentage(df):
    """
    Builds a two-column summary of how much data is missing in each column.

    Parameters:
    - df: Pandas DataFrame to inspect

    Returns:
    - Pandas DataFrame with one row per column of `df`: 'Column' holds the
      column name and 'Missing Percentage' the share of missing values (0-100)
    """
    # The mean of the boolean null mask is the fraction of missing entries;
    # scale it to a percentage before turning the Series into a table.
    pct_missing = df.isnull().mean() * 100

    summary = pct_missing.reset_index()
    summary.columns = ['Column', 'Missing Percentage']
    return summary


In [None]:
missing_value_percentage(crime_df)

Handling missing data in the Ward and Community Area columns

In [None]:
crime_df.Ward.head()

In [None]:
crime_df.Ward.unique()

In [None]:
# 1. drop off all rows with missing Ward values
# try it yourself



In [None]:
# 2. fill missing Ward values with the mode (the most common Ward) and assign
#    the result back to the Ward column
# mode() returns a Series, so [0] picks out its first value
most_fq_ward = crime_df['Ward'].mode()[0]
crime_df['Ward'] = crime_df['Ward'].fillna(most_fq_ward)

In [None]:
crime_df.isna().sum()

In [None]:
# 1. drop off all rows with missing Community Area values
# try it yourself

In [None]:
# 2. fill missing Community Area values with the mode (the most common Community Area), then assign the result back to the Community Area column

In [None]:
crime_df['Community Area'].head()

Handling Missing Longitude and Latitude Values: Using Association.
- The assumption here is that longitude and latitude readings (points that give an exact location) are close to each other within a district. If they are, we can group our data by district and fill missing Latitude and Longitude values with the group's median or mode.

In [None]:
crime_df.District.unique()

In [None]:
# district 17 lat, lon values
crime_df.groupby('District').get_group(17)[['Longitude', 'Latitude']]

In [None]:
crime_df.groupby('District').get_group(15)[['Longitude', 'Latitude']]

The Lat/Lon points appear to be close together when the data is grouped by district, so we can group by district and fill the missing values with the median or mode.

In [None]:
# Fill missing coordinates with the median coordinate of the row's District.
# The per-district medians are only used to fill gaps via fillna(), so existing
# readings are never replaced. Assigning the groupby/transform result straight
# back to the column would overwrite the coordinates of rows whose District is
# missing with NaN, because groupby excludes NaN keys by default.
crime_df['Longitude'] = crime_df['Longitude'].fillna(
    crime_df.groupby('District')['Longitude'].transform('median')
)
crime_df['Latitude'] = crime_df['Latitude'].fillna(
    crime_df.groupby('District')['Latitude'].transform('median')
)

In [None]:
crime_df.isna().sum()

Attempt the same for x and y coordinates.

`REMEMBER`: You can fill or drop values either in place (`inplace=True`) or by assigning the result back to the column.

Checking for duplicates within the dataset

In [None]:
crime_df.duplicated().sum()

In [None]:
crime_df['Case Number'].duplicated().sum()

In [None]:
# Drop rows that are exact duplicates across every column, modifying crime_df in place.
# No `subset` is given, so rows that merely share a 'Case Number' but differ in
# another column are kept.
crime_df.drop_duplicates(inplace=True)

In [None]:
crime_df['Case Number'].nunique()

Feature Engineering & Temporal Analysis

In [None]:
from datetime import datetime

def extract_and_map_month(date):
    """
    Converts a column of date strings into the corresponding month names.

    Parameters:
    - date: Pandas Series of date strings such as '07/26/2008 02:30:00 PM'

    Returns:
    - Pandas Series containing the full month name (e.g. 'July') for each entry
    """
    # Parse with the explicit 12-hour-clock layout used by the dataset
    parsed = pd.to_datetime(date, format='%m/%d/%Y %I:%M:%S %p')
    return parsed.dt.month_name()

- %m: Represents the month as a zero-padded decimal number (01, 02, ..., 12).
- %d: Represents the day of the month as a zero-padded decimal number (01, 02, ..., 31).
- %Y: Represents the year with century as a decimal number (0001, 0002, ..., 2013, 2014, ..., 9998, 9999).
- %I: Represents the hour (12-hour clock) as a zero-padded decimal number (01, 02, ..., 12).
- %M: Represents the minute as a zero-padded decimal number (00, 01, ..., 59).
- %S: Represents the second as a zero-padded decimal number (00, 01, ..., 59).
- %p: Represents either AM or PM.
Putting it all together, the format '%m/%d/%Y %I:%M:%S %p' is indicating that the expected format of the input date strings is 'Month/Day/Year Hour:Minute:Second AM/PM'.

For example, if you have a date string like '08/21/2001 12:00:00 AM', the function will use this format to correctly parse the date components. This helps Pandas understand the structure of the date strings and convert them to a datetime format.

In [None]:
crime_df['Month'] = extract_and_map_month(crime_df['Date'])

In [None]:
crime_df.head()

In [None]:
crime_df.groupby('Month')['Primary Type'].value_counts()

To find the month with the highest crime, we group the data by the 'Month' column and then count the occurrences of each unique 'Month' value. This gives a count of how many records exist for each month in the dataset.

In [None]:
crime_counts_by_month = crime_df.groupby('Month')['Month'].count()

In [None]:
# Find the month with the highest crime count
highest_crime_month = crime_counts_by_month.idxmax()

print("Month with the highest crime count:", highest_crime_month)

In [None]:
# Tally how many records fall under each crime category
crime_counts = crime_df['Primary Type'].value_counts()

# Report the category with the largest tally, then the tally itself
most_common_crime = crime_counts.idxmax()
print("Most common crime:", most_common_crime)

crime_count = crime_counts.max()
print("Number of occurrences:", crime_count)

In [None]:
# creating a column for seasons

def create_seasons_column(date_column):
    """
    Derives a season label for every entry of a date column.

    Parameters:
    - date_column: Pandas Series of dates, either datetimes or strings laid out
      like '07/26/2008 02:30:00 PM'

    Returns:
    - Pandas Series of season names ('Winter', 'Spring', 'Summer', 'Fall');
      entries that cannot be parsed as dates come back as missing values
    """
    # errors='coerce' turns unparseable entries into NaT instead of raising
    parsed_dates = pd.to_datetime(
        date_column, format='%m/%d/%Y %I:%M:%S %p', errors='coerce'
    )

    # Seasons listed with the calendar months they cover
    months_by_season = {
        'Winter': (12, 1, 2),
        'Spring': (3, 4, 5),
        'Summer': (6, 7, 8),
        'Fall': (9, 10, 11),
    }

    # Invert the table into a month-number -> season lookup
    season_of_month = {
        month: season
        for season, months in months_by_season.items()
        for month in months
    }

    return parsed_dates.dt.month.map(season_of_month)

In [None]:
crime_df['Seasons'] = create_seasons_column(crime_df['Date'])

In [None]:
crime_df.loc[:, 'Seasons'].head()