In [25]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Importing Data

In [26]:
# raw_data directory
raw_data_path = "../Data/raw_data"
# Initialize an empty list to store DataFrames
df_list = []

# Use os.walk to traverse the directory and find all CSV files
for root, dirs, files in os.walk(raw_data_path):
    for file in files:
        if file.endswith(".csv"):
            print(f"Found file: {file}")  # Confirm files are found
            # Get the full file path
            file_path = os.path.join(root, file)
            # Read the CSV file into a DataFrame and add it to the list
            df_list.append(pd.read_csv(file_path))



# Check if df_list is empty before concatenating
if df_list:
    # Concatenate all DataFrames into one
    combined_df = pd.concat(df_list, ignore_index=True)
    print("All CSV files have been successfully read into a single DataFrame.")
else:
    print("No CSV files found in the directory.")

# display the first few rows of the combined DataFrame
combined_df.head()


Found file: 2023-01-cleveland-street.csv
Found file: 2023-02-cleveland-street.csv
Found file: 2023-03-cleveland-street.csv
Found file: 2023-04-cleveland-street.csv
Found file: 2023-05-cleveland-street.csv
Found file: 2023-06-cleveland-street.csv
Found file: 2023-07-cleveland-street.csv
Found file: 2023-08-cleveland-street.csv
Found file: 2023-09-cleveland-street.csv
Found file: 2023-10-cleveland-street.csv
Found file: 2023-11-cleveland-street.csv
Found file: 2023-12-cleveland-street.csv
Found file: 2024-01-cleveland-street.csv
Found file: 2024-02-cleveland-street.csv
Found file: 2024-03-cleveland-street.csv
Found file: 2024-04-cleveland-street.csv
Found file: 2024-05-cleveland-street.csv
Found file: 2024-06-cleveland-street.csv
All CSV files have been successfully read into a single DataFrame.


Unnamed: 0,Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context
0,a4c83212c5e08e368dc841371fc26a62f7a476d19dd836...,2023-01,Cleveland Police,Cleveland Police,-1.212283,54.506084,On or near Bromley Lane,E01027626,Hambleton 002D,Other theft,Investigation complete; no suspect identified,
1,,2023-01,Cleveland Police,Cleveland Police,-1.23566,54.710526,On or near Dobson Place,E01011954,Hartlepool 001A,Anti-social behaviour,,
2,,2023-01,Cleveland Police,Cleveland Police,-1.237805,54.711196,On or near King Oswy Shops,E01011954,Hartlepool 001A,Anti-social behaviour,,
3,,2023-01,Cleveland Police,Cleveland Police,-1.236325,54.713146,On or near Lazenby Road,E01011954,Hartlepool 001A,Anti-social behaviour,,
4,c108d52a0f5bb4dd8e7f50ad9bd75e583a0a0d2ab1d1ee...,2023-01,Cleveland Police,Cleveland Police,-1.239865,54.710589,On or near Marshall Close,E01011954,Hartlepool 001A,Burglary,Investigation complete; no suspect identified,


In [27]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142284 entries, 0 to 142283
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Crime ID               116539 non-null  object 
 1   Month                  142284 non-null  object 
 2   Reported by            142284 non-null  object 
 3   Falls within           142284 non-null  object 
 4   Longitude              142284 non-null  float64
 5   Latitude               142284 non-null  float64
 6   Location               142284 non-null  object 
 7   LSOA code              142284 non-null  object 
 8   LSOA name              142284 non-null  object 
 9   Crime type             142284 non-null  object 
 10  Last outcome category  116539 non-null  object 
 11  Context                0 non-null       float64
dtypes: float64(3), object(9)
memory usage: 13.0+ MB


In [28]:
combined_df.describe()

Unnamed: 0,Longitude,Latitude,Context
count,142284.0,142284.0,0.0
mean,-1.220911,54.586892,
std,0.091619,0.050365,
min,-1.436189,54.472215,
25%,-1.289756,54.555941,
50%,-1.230299,54.570602,
75%,-1.191889,54.606144,
max,-0.792048,54.732026,


In [29]:
combined_df.shape

(142284, 12)

In [30]:
combined_df.value_counts

<bound method DataFrame.value_counts of                                                  Crime ID    Month  \
0       a4c83212c5e08e368dc841371fc26a62f7a476d19dd836...  2023-01   
1                                                     NaN  2023-01   
2                                                     NaN  2023-01   
3                                                     NaN  2023-01   
4       c108d52a0f5bb4dd8e7f50ad9bd75e583a0a0d2ab1d1ee...  2023-01   
...                                                   ...      ...   
142279  33c553f355dbd08308638ac06e166f5a9ca1590f9d3280...  2024-06   
142280  daab4eba0483b218c0da158b7146485c161886a0456ad8...  2024-06   
142281  73a6916f5fa27c5d68fbcebcc6bf26843357aae6442657...  2024-06   
142282  c3017459024fd28ca813b7d632d799da1dfefd8c3cc961...  2024-06   
142283                                                NaN  2024-06   

             Reported by      Falls within  Longitude   Latitude  \
0       Cleveland Police  Cleveland Police  -1.2122

In [31]:
combined_df.head(5)

Unnamed: 0,Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context
0,a4c83212c5e08e368dc841371fc26a62f7a476d19dd836...,2023-01,Cleveland Police,Cleveland Police,-1.212283,54.506084,On or near Bromley Lane,E01027626,Hambleton 002D,Other theft,Investigation complete; no suspect identified,
1,,2023-01,Cleveland Police,Cleveland Police,-1.23566,54.710526,On or near Dobson Place,E01011954,Hartlepool 001A,Anti-social behaviour,,
2,,2023-01,Cleveland Police,Cleveland Police,-1.237805,54.711196,On or near King Oswy Shops,E01011954,Hartlepool 001A,Anti-social behaviour,,
3,,2023-01,Cleveland Police,Cleveland Police,-1.236325,54.713146,On or near Lazenby Road,E01011954,Hartlepool 001A,Anti-social behaviour,,
4,c108d52a0f5bb4dd8e7f50ad9bd75e583a0a0d2ab1d1ee...,2023-01,Cleveland Police,Cleveland Police,-1.239865,54.710589,On or near Marshall Close,E01011954,Hartlepool 001A,Burglary,Investigation complete; no suspect identified,


#### checking the unique counts of the crime type column.


In [32]:
combined_df["Crime type"].value_counts()

Crime type
Violence and sexual offences    47065
Anti-social behaviour           25745
Criminal damage and arson       14347
Shoplifting                     11822
Public order                    10834
Other theft                      7851
Burglary                         6962
Vehicle crime                    6691
Drugs                            3823
Other crime                      3271
Robbery                          1262
Possession of weapons            1096
Bicycle theft                     936
Theft from the person             579
Name: count, dtype: int64

#### checking the unique counts of the Last outcome category column.


In [33]:
combined_df["Last outcome category"].value_counts()

Last outcome category
Unable to prosecute suspect                            50891
Investigation complete; no suspect identified          45560
Court result unavailable                                5235
Under investigation                                     4659
Awaiting court outcome                                  4615
Status update unavailable                               1913
Further action is not in the public interest             791
Action to be taken by another organisation               728
Further investigation is not in the public interest      664
Formal action is not in the public interest              625
Offender given a caution                                 359
Local resolution                                         189
Offender given penalty notice                             29
Suspect charged as part of another case                   23
Name: count, dtype: int64

#### Checking for Null values 

In [34]:
combined_df.isnull().sum()

Crime ID                  25745
Month                         0
Reported by                   0
Falls within                  0
Longitude                     0
Latitude                      0
Location                      0
LSOA code                     0
LSOA name                     0
Crime type                    0
Last outcome category     25745
Context                  142284
dtype: int64

##### * The null values in the crime ID column and Last outcome category column are the same. 
##### * The Context colunm doesnt have any value, its all Null. Delete?

##### Filtering through the df to identify relationship between CrimeID and Null values. 

In [35]:
filtered_df= combined_df[combined_df["Crime ID"].isnull()]
filtered_df

Unnamed: 0,Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context
1,,2023-01,Cleveland Police,Cleveland Police,-1.235660,54.710526,On or near Dobson Place,E01011954,Hartlepool 001A,Anti-social behaviour,,
2,,2023-01,Cleveland Police,Cleveland Police,-1.237805,54.711196,On or near King Oswy Shops,E01011954,Hartlepool 001A,Anti-social behaviour,,
3,,2023-01,Cleveland Police,Cleveland Police,-1.236325,54.713146,On or near Lazenby Road,E01011954,Hartlepool 001A,Anti-social behaviour,,
19,,2023-01,Cleveland Police,Cleveland Police,-1.251890,54.711643,On or near Westwood Way,E01011971,Hartlepool 001D,Anti-social behaviour,,
24,,2023-01,Cleveland Police,Cleveland Police,-1.244546,54.711795,On or near Rafton Drive,E01033465,Hartlepool 001F,Anti-social behaviour,,
...,...,...,...,...,...,...,...,...,...,...,...,...
142261,,2024-06,Cleveland Police,Cleveland Police,-1.317364,54.528175,On or near Brough Field Close,E01033475,Stockton-on-Tees 028A,Anti-social behaviour,,
142265,,2024-06,Cleveland Police,Cleveland Police,-1.327886,54.518788,On or near Glyder Court,E01035204,Stockton-on-Tees 028B,Anti-social behaviour,,
142270,,2024-06,Cleveland Police,Cleveland Police,-1.330211,54.514541,On or near Newgale Close,E01012231,Stockton-on-Tees 028C,Anti-social behaviour,,
142271,,2024-06,Cleveland Police,Cleveland Police,-1.323492,54.512617,On or near Owls Grove,E01012231,Stockton-on-Tees 028C,Anti-social behaviour,,


##### The null values in the crimeID and LastOutcomeCategory are correllated with the AntiSocialBehaviour Column.
##### AntisocialBehaviour has no crimeId and doesnt have a last outcome. Maybe its not treated as a crime.
##### The Nan value in LAstOutcomeCategory will be filled with "statusUnknown"

In [40]:
combined_df["Last outcome category"].fillna("Status_unknown")

0         Investigation complete; no suspect identified
1                                        Status_unknown
2                                        Status_unknown
3                                        Status_unknown
4         Investigation complete; no suspect identified
                              ...                      
142279                              Under investigation
142280                              Under investigation
142281    Investigation complete; no suspect identified
142282                      Unable to prosecute suspect
142283                                   Status_unknown
Name: Last outcome category, Length: 142284, dtype: object