In [None]:
import pandas as pd
import numpy as np


In [None]:

# Load the train/test splits from CSV files located in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# (n_rows, n_columns) of the training set.
train_df.shape

In [None]:
# (n_rows, n_columns) of the test set.
test_df.shape

In [None]:

# Preview the training set; head() with no argument shows the first 5 rows.
print("First 5 rows of the training data:")
train_df.head()


In [None]:

# Preview the test set; head() with no argument shows the first 5 rows.
print("\nFirst 5 rows of the test data:")
test_df.head()

In [None]:
# Number of missing values in each column of the training set.
train_df.isnull().sum()

In [None]:
# Number of missing values in each column of the test set.
test_df.isnull().sum()

In [None]:

# DataFrame.info() writes its summary directly to stdout and returns None,
# so it is called on its own; wrapping it in print() would append a stray "None".
print("\nTraining data information:")
train_df.info()

In [None]:

# DataFrame.info() writes its summary directly to stdout and returns None,
# so it is called on its own; wrapping it in print() would append a stray "None".
print("\nTest data information:")
test_df.info()



In [None]:
# Number of training rows that exactly repeat an earlier row (all columns equal).
train_df.duplicated().sum()


In [None]:
# Remove the duplicated training rows in place. The index is not reset here,
# so the remaining rows keep their original index labels (gaps are possible).
train_df.drop_duplicates(inplace=True)

In [None]:
# Number of test rows that exactly repeat an earlier row (reported only; nothing is dropped here).
test_df.duplicated().sum()


In [None]:
# Number of distinct values in the Address column of the training set.
train_df["Address"].nunique()

In [None]:
# Convert the 'Dates' column of both splits to pandas datetimes so the
# .dt accessor can be used further down.
for frame in (train_df, test_df):
    frame['Dates'] = pd.to_datetime(frame['Dates'])

train_df[['Dates']].head()


In [None]:
# Latitude/longitude bounding box used below to decide whether a record
# lies inside the city limits (values in decimal degrees).
sf_geobounds = dict(
    min_lat=37.70,
    max_lat=37.84,
    min_lon=-122.52,
    max_lon=-122.35,
)


In [None]:

# Flag each record as inside/outside the bounding box: Y is compared against
# the latitude limits and X against the longitude limits (both inclusive).
lat_ok = train_df['Y'].between(sf_geobounds['min_lat'], sf_geobounds['max_lat'])
lon_ok = train_df['X'].between(sf_geobounds['min_lon'], sf_geobounds['max_lon'])
train_df['is_in_sf'] = (lat_ok & lon_ok).to_numpy()

outside_count = int((~train_df['is_in_sf']).sum())
print(f"Number of crimes outside the boundaries: {outside_count}")


In [None]:
# Inspect the records flagged as outside the bounding box.
suspicious_crimes = train_df[~train_df['is_in_sf']]
print(suspicious_crimes['Category'].value_counts().head(5))

# Records south of the minimum latitude or west of the minimum longitude.
# The limits are read from sf_geobounds so they cannot drift from the
# bounding box used to build 'is_in_sf'.
ocean_crimes = suspicious_crimes[
    (suspicious_crimes['Y'] < sf_geobounds['min_lat'])
    | (suspicious_crimes['X'] < sf_geobounds['min_lon'])
]
print(f"Crimes in the ocean: {len(ocean_crimes)}")


In [None]:
# Keep only the in-bounds records; .copy() makes df_clean independent of train_df.
inside_mask = train_df['is_in_sf']
df_clean = train_df.loc[inside_mask].copy()

print(f"Original data: {len(train_df)} records")
print(f"After cleaning: {len(df_clean)} records")


In [None]:
# Split the timestamp into calendar components (DayOfWeek holds the day name).
timestamp = df_clean['Dates'].dt
df_clean = df_clean.assign(
    Year=timestamp.year,
    Month=timestamp.month,
    Day=timestamp.day,
    Hour=timestamp.hour,
    DayOfWeek=timestamp.day_name(),
)

print(df_clean[['Dates', 'Year', 'Month', 'Day', 'Hour', 'DayOfWeek']].head())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map every crime category to an integer code; `le` is reused by later cells.
le = LabelEncoder()
le.fit(df_clean['Category'])
df_clean['Category_encoded'] = le.transform(df_clean['Category'])

print(df_clean[['Category', 'Category_encoded']].head(10))

In [None]:
# One-hot encode the police district and append the indicator columns
# (named 'District_<value>') to the right of df_clean.
district_encoded = pd.get_dummies(df_clean['PdDistrict'], prefix='District')
df_clean = pd.concat((df_clean, district_encoded), axis='columns')

print(df_clean.filter(like='District_').head())

In [None]:
# Number of cleaned records per crime category, most frequent first.
df_clean["Category"].value_counts()

In [None]:
# Number of cleaned records per Resolution value, most frequent first.
df_clean["Resolution"].value_counts()

In [None]:
# Number of cleaned records per day name, most frequent first.
df_clean["DayOfWeek"].value_counts()

# Hierarchical Clustering Tuned with a Genetic Algorithm

In [None]:
# Import my libraries
from sklearn.cluster import AgglomerativeClustering # to make cluster
from sklearn.metrics import silhouette_score # to evaluate this cluster
from sklearn.preprocessing import StandardScaler # to normalize the data
import random # to choose random values
from scipy.cluster.hierarchy import dendrogram,linkage,fcluster # to make cluster ,draw,know labels
import matplotlib.pyplot as plt # to draw 

In [None]:
# Unsupervised step: the records are clustered on their coordinates only
# (columns X and Y); no category labels are used.
# One StandardScaler per column is prepared, and each column is pulled out as a
# NumPy array because the scaler is fed reshaped arrays in the next cell.
data = df_clean[['X', 'Y']]
scalarx, scalary = StandardScaler(), StandardScaler()
x = data['X'].to_numpy(copy=True)
y = data['Y'].to_numpy(copy=True)

In [None]:
# Standardise each coordinate with its OWN scaler, so scalarx keeps the X
# statistics and scalary keeps the Y statistics (previously Y was fitted with
# scalarx, overwriting the X parameters and leaving scalary unused).
# reshape(-1, 1) gives the 2-D input the scaler requires; flatten() returns to 1-D.
x = scalarx.fit_transform(x.reshape(-1, 1)).flatten()
y = scalary.fit_transform(y.reshape(-1, 1)).flatten()
newdata = pd.DataFrame({'X': x, 'Y': y})
# Only the first 10,000 rows are kept: per the author, hierarchical clustering
# on the full dataset needs too much memory.
newdata = newdata.head(10000)
newdata

In [None]:
# Generation zero of the genetic algorithm: four candidate parameter sets.
# Each candidate is [k (number of clusters), linkage criterion, distance metric];
# the GA searches for the combination with the best silhouette score.
population = dict(
    parent1=[2, 'ward', 'euclidean'],
    parent2=[2, 'single', 'manhattan'],
    parent3=[2, 'complete', 'mahalanobis'],
    parent4=[2, 'average', 'euclidean'],
)
population

In [None]:
# Selection step: score every candidate and keep the two best ones.
def fitness(population,data):
    """Rank the candidates by silhouette score and return the top two.

    Parameters
    ----------
    population : dict
        Maps a candidate name to ``[n_clusters, linkage, metric]``.
    data : array-like
        Samples passed to ``AgglomerativeClustering.fit_predict`` and to
        ``silhouette_score``.

    Returns
    -------
    dict
        At most two entries ``name -> [candidate, score]``, ordered from the
        highest score to the lowest.
    """
    scored = {}
    for name, genes in population.items():
        model = AgglomerativeClustering(
            n_clusters=genes[0],
            linkage=genes[1],
            metric=genes[2],
        )
        cluster_labels = model.fit_predict(data)
        scored[name] = [genes, silhouette_score(data, cluster_labels)]

    # Highest silhouette first; only the two leaders survive.
    ranked = sorted(scored.items(), key=lambda entry: entry[1][1], reverse=True)
    return dict(ranked[:2])

In [None]:
# Crossover step: the two selected parents are recombined into two children.
def crossover(parents):
    """Build the next generation from the two selected parents.

    Parameters
    ----------
    parents : dict
        Output of ``fitness``: ``name -> [[k, linkage, metric], score]``.
        Must contain at least two entries.

    Returns
    -------
    dict
        Four candidates keyed ``'parent1'``, ``'parent2'``, ``'child1'`` and
        ``'child2'``. The parents are carried over because they may still beat
        the children. Each child takes ``k`` from one parent and the
        (linkage, metric) pair from the other, i.e. the cut is after position 0.

    Notes
    -----
    The parent parameter lists are copied. Returning the caller's own list
    objects would let a later in-place change (such as the mutation step)
    silently modify the population the parents came from.
    """
    ranked = [entry[0] for entry in parents.values()]
    # 'parent1' is the second-ranked candidate and 'parent2' the first-ranked
    # one, matching the original ordering.
    parent1data = list(ranked[1])
    parent2data = list(ranked[0])
    return {
        'parent1': parent1data,
        'parent2': parent2data,
        'child1': [parent1data[0], parent2data[1], parent2data[2]],
        'child2': [parent2data[0], parent1data[1], parent1data[2]],
    }

In [None]:
# Mutation step: pick one candidate at random and increase its k by 1.
# The seeds are fixed so the same positions are chosen on every fresh run;
# without them the GA would end with different parameters each time.
random.seed(42)
np.random.seed(42)
def mutation(newgeneration):
    """Increase ``k`` of one randomly chosen candidate by 1.

    Parameters
    ----------
    newgeneration : dict
        Maps a candidate name to ``[k, linkage, metric]``.

    Returns
    -------
    dict
        The same dict object, with the chosen candidate replaced by a new list
        whose first element is ``k + 1``. An empty dict is returned unchanged.

    Notes
    -----
    The position is drawn over the actual number of candidates rather than a
    fixed 0-3 range, and the chosen list is replaced instead of edited in
    place so that lists shared with other generations are left untouched.
    """
    if not newgeneration:
        return newgeneration
    names = list(newgeneration)
    chosen = names[random.randrange(len(names))]
    genes = newgeneration[chosen]
    newgeneration[chosen] = [genes[0] + 1, *genes[1:]]
    return newgeneration
    

In [None]:
# Driver of the genetic algorithm: selection -> crossover -> mutation,
# repeated recursively for a fixed number of generations.
def choosebestmodel(population,data,k,max_generations=5):
    """Evolve the population and return the best parameter set found.

    Parameters
    ----------
    population : dict
        Maps a candidate name to ``[n_clusters, linkage, metric]``.
    data : array-like
        Samples forwarded to ``fitness``.
    k : int
        Index of the current generation; pass 0 to start.
    max_generations : int, default 5
        Number of evolution rounds performed before the final evaluation.

    Returns
    -------
    list
        ``[[n_clusters, linkage, metric], silhouette_score]`` of the best
        candidate in the last generation. The same value is also printed.
    """
    # Stop condition; ``>=`` also terminates when k starts beyond the limit.
    if k >= max_generations:
        bestsolution=fitness(population,data)
        best=list(bestsolution.values())[0]
        print(best)
        return best

    parents=fitness(population,data)
    newgeneration=crossover(parents)
    nextgeneration=mutation(newgeneration)

    # Propagate the result of the deeper call back to the original caller.
    return choosebestmodel(nextgeneration,data,k=k+1,max_generations=max_generations)
    

In [None]:
# Run the genetic algorithm starting at generation 0; choosebestmodel prints the
# best parameter set together with its silhouette score.
# Author's note: the resulting silhouette score was close to 1, which indicates
# cohesive, well-separated clusters.
bestsolution=choosebestmodel(population,newdata,0)

In [None]:
# Build the SciPy linkage matrix with the chosen parameters so the dendrogram
# can be drawn from it.
# NOTE(review): 'average' / 'euclidean' are hard-coded here rather than read
# from the GA result - confirm they match what choosebestmodel printed.
bestmodel=linkage(newdata,method='average',metric='euclidean')

In [None]:
# Cut the linkage tree into flat clusters: with criterion='maxclust', t=2 asks
# for at most 2 clusters. The silhouette score of that labelling is then
# computed for comparison with the score reported by the GA run.
labels = fcluster(bestmodel, t=2, criterion='maxclust')
score = silhouette_score(newdata, labels)
score

In [None]:
# Draw the dendrogram together with a horizontal cut line that separates the
# tree into 2 groups. Column 2 of the linkage matrix holds the merge distances,
# and the last row is the final merge, so any height between the last two merge
# distances yields exactly 2 clusters; the midpoint is used instead of a
# hard-coded value that may not match the data.
cut_height = (bestmodel[-2, 2] + bestmodel[-1, 2]) / 2
plt.figure(figsize=(15,7))
dendrogram(bestmodel,no_labels=True)
plt.axhline(y=cut_height,color='hotpink',linestyle='--')
plt.title(" Best Hierarchical cluster")
plt.show()

#  Random forest

First, we used Logistic Regression and Random Forest without any feature selection,
and the accuracy was very low. 
After applying feature selection, 
the accuracy improved significantly. 

In [None]:
# Encode the police district names as integer labels for use in the models.
# Note: this re-fits the shared `le` encoder, so from here on `le` holds the
# PdDistrict mapping and no longer the Category mapping it was fitted with earlier.
df_clean["PdDistrict_Label"] = le.fit_transform(df_clean["PdDistrict"])


In [None]:

from sklearn.ensemble import RandomForestClassifier

# Random forest that predicts the police district label from the coordinates.
x = df_clean[["X", "Y"]]
y = df_clean["PdDistrict_Label"]

# 100 trees, depth capped at 32, fixed seed so repeated runs give the same forest.
random_forest_model = RandomForestClassifier(n_estimators=100, max_depth=32, random_state=1)

# Train on every cleaned record.
random_forest_model.fit(x, y)

In [None]:
# Build the evaluation inputs from test.csv: coordinates as features and the
# district name, encoded with the already-fitted `le`, as the target.
X_test = test_df[["X", "Y"]]
test_df["PdDistrict_encoded"] = le.transform(test_df["PdDistrict"])
y_test = test_df["PdDistrict_encoded"]

In [None]:

# Mean accuracy of the district model on the data it was fitted on and on the test set.
train_acc, test_acc = (
    random_forest_model.score(x, y),
    random_forest_model.score(X_test, y_test),
)
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

Here, we used a heatmap to find the features most correlated with our target variable Category.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.tseries.holiday import USFederalHolidayCalendar

# Integer-encode the crime description. The column is stored as "District" for
# backward compatibility, but it holds Descript codes (not a district).
df_clean["District"] = le.fit_transform(df_clean["Descript"])

# The heatmap needs "Weekend" and "Holiday", which no earlier cell of this
# notebook creates; derive them from the timestamp when they are absent so the
# column selection below does not fail with a KeyError.
if "Weekend" not in df_clean.columns:
    # dayofweek: Monday=0 ... Sunday=6, so 5 and 6 are Saturday/Sunday.
    df_clean["Weekend"] = (df_clean["Dates"].dt.dayofweek >= 5).astype(int)
if "Holiday" not in df_clean.columns:
    # NOTE(review): "Holiday" is assumed to mean a US federal holiday - confirm.
    incident_days = df_clean["Dates"].dt.normalize()
    federal_holidays = USFederalHolidayCalendar().holidays(
        start=incident_days.min(), end=incident_days.max()
    )
    df_clean["Holiday"] = incident_days.isin(federal_holidays).astype(int)

# The Descript codes are shown as "Descript_Label" so the heatmap is not read
# as containing two district features.
df = df_clean[["X", "Y", "Year", "Month", "Day", "Hour", "PdDistrict_Label",
               "Holiday", "Weekend", "Category_encoded", "District"]].rename(
    columns={"District": "Descript_Label"}
)
corMatrix = df.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corMatrix, annot=True, cmap="coolwarm", fmt=".2f", square=True, cbar_kws={"shrink": .8})
plt.show()

We find that "X", "Y", "PdDistrict_Label", and "Descript_Label" are the features most strongly correlated with our target, "Category".

In [None]:
from sklearn.model_selection import train_test_split

# Features and target for predicting the (encoded) crime category.
# NOTE(review): Descript describes the incident itself and may determine
# Category almost directly - confirm this feature is not target leakage.
df_clean["Descript_Label"] = le.fit_transform(df_clean["Descript"])
feature_columns = ["X", "Y", "PdDistrict_Label", "Descript_Label"]
x = df_clean[feature_columns]
y = df_clean["Category_encoded"]

# 80/20 split with a fixed seed; stratify=y keeps the class proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:

# Random forest for the crime category, with the same settings as the district
# model (100 trees, depth capped at 32, fixed seed), fitted on the training part only.
forest_settings = dict(n_estimators=100, max_depth=32, random_state=1)
random_forest_modelcategory = RandomForestClassifier(**forest_settings)

random_forest_modelcategory.fit(X_train, y_train)

In [None]:
# Accuracy on the training part versus the held-out part. The training score is
# computed on (X_train, y_train); scoring on the full (x, y) would mix the
# held-out rows into the figure reported as "Train Accuracy".
train_acc = random_forest_modelcategory.score(X_train, y_train)
test_acc = random_forest_modelcategory.score(X_test, y_test)
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

# KMedoids method

In [None]:
# Import required libraries
import numpy as np
import warnings
import matplotlib.pyplot as plt
from tabulate import tabulate               # Import tabulate for creating formatted tables
import seaborn as sns
import plotly.express as px    # For interactive visualizations
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn_extra.cluster import KMedoids             

In [None]:

warnings.filterwarnings("ignore")

# Keep only coordinates inside a coarse window around the city, then draw a
# random sample without replacement. The sample is capped at the number of
# available rows so np.random.choice cannot fail on a small dataset.
coords = train_df[(train_df['X'].between(-123, -122)) &
                 (train_df['Y'].between(37.7, 38))][['X', 'Y']].copy()
sample_size = min(10000, coords.shape[0])
sample_indices = np.random.choice(coords.shape[0], sample_size, replace=False)
sample_coords = coords.iloc[sample_indices]

# Standardise both coordinates before clustering.
scaler = StandardScaler()
sample_scaled = scaler.fit_transform(sample_coords)

# silhouette_scores[i] belongs to evaluated_k[i]. Candidates that are skipped
# or fail are recorded in neither list, so the two always stay aligned.
silhouette_scores = []
evaluated_k = []
# Candidate numbers of clusters.
k_values = [2, 3, 4, 5]

for k in k_values:
    try:
        # k-medoids++ initialisation, fixed seed, Euclidean distance and at
        # most 50 iterations to keep the search fast.
        kmedoids = KMedoids(n_clusters=k,
                           init='k-medoids++',
                           random_state=42,
                           metric='euclidean',
                           max_iter=50)
        labels = kmedoids.fit_predict(sample_scaled)

        # The silhouette score needs at least 2 distinct clusters.
        if len(np.unique(labels)) < 2:
            print(f"Skipping k={k} (only 1 cluster formed)")
            continue

        score = silhouette_score(sample_scaled, labels)
        silhouette_scores.append(score)
        evaluated_k.append(k)
        print(f"k={k}: Silhouette Score = {score:.3f}")
    except Exception as e:
        # Best effort: report the failure and move on to the next candidate.
        print(f"Error with k={k}: {str(e)}")
        continue

# Determine optimal k
if not silhouette_scores:
    raise ValueError("No valid clustering solutions found")

# Index into evaluated_k (not k_values): if any candidate was skipped, the
# positions in k_values no longer correspond to the stored scores.
optimal_k = evaluated_k[int(np.argmax(silhouette_scores))]
print(f"\nOptimal k: {optimal_k}")


In [None]:
# Final clustering with the selected number of clusters, on the same scaled
# sample and with the same settings used during the search.
final_kmedoids = KMedoids(
    n_clusters=optimal_k,
    init='k-medoids++',
    random_state=42,
    metric='euclidean',
    max_iter=50
)

# Fit on the scaled sample and obtain one cluster label per sampled point.
final_labels = final_kmedoids.fit_predict(sample_scaled)

# Visualisation frame: original (unscaled) coordinates plus the cluster label
# as a string so it is coloured as a category.
cluster_viz = sample_coords.copy()
cluster_viz['Cluster'] = final_labels.astype(str)
# Look the category up by index label. sample_indices are positions inside the
# filtered `coords` frame, so using them with train_df.iloc would pick
# different rows whenever the coordinate filter removed any record.
cluster_viz['Category'] = train_df.loc[sample_coords.index, 'Category'].values

# Interactive scatter plot of the clusters; the crime category is shown on hover.
fig = px.scatter(
    cluster_viz,
    x='X',
    y='Y',
    color='Cluster',
    title=f'K-Medoids Clustering (k={optimal_k})',
    hover_data=['Category']
)

# Small, slightly transparent markers keep dense areas readable.
fig.update_traces(
    marker=dict(
        size=4,
        opacity=0.7
    )
)

fig.show()

In [None]:
# Text report per cluster: size, coordinate extent, mean position and the three
# most frequent crime categories.
for cluster in range(optimal_k):
    # Cluster labels are stored as strings in cluster_viz.
    cluster_data = cluster_viz.loc[cluster_viz['Cluster'] == str(cluster)]
    xs, ys = cluster_data['X'], cluster_data['Y']

    stats_table = [
        ["Number of points", len(cluster_data)],
        ["X coordinate range", f"{xs.min():.2f} to {xs.max():.2f}"],
        ["Y coordinate range", f"{ys.min():.2f} to {ys.max():.2f}"],
        ["Mean coordinates", f"X={xs.mean():.2f}, Y={ys.mean():.2f}"],
    ]

    # Boxed header followed by the statistics grid.
    print(f"\n╒{'═'*50}╕")
    print(f"│ {'Cluster ' + str(cluster):^48} │")
    print(f"╞{'═'*50}╡")
    print(tabulate(stats_table, tablefmt="simple_grid"))

    # Two-column frame: category name and its count within this cluster.
    top_crimes = (
        cluster_data['Category']
        .value_counts()
        .head(3)
        .rename_axis('Crime Category')
        .reset_index(name='Count')
    )
    print("\nTop Crime Categories:")
    print(tabulate(top_crimes, headers='keys', tablefmt="pretty", showindex=False))

In [None]:
# One bar chart per cluster, stacked vertically; the figure height grows with
# the number of clusters.
fig, axes = plt.subplots(nrows=optimal_k, ncols=1, figsize=(12, 6*optimal_k), dpi=100)

for cluster, ax in enumerate(axes):
    # Records assigned to this cluster and their 20 most frequent categories.
    cluster_data = cluster_viz[cluster_viz['Cluster'] == str(cluster)]
    top_crimes = cluster_data['Category'].value_counts().head(20)

    # A different tab20 colour for every cluster.
    bars = ax.bar(top_crimes.index, top_crimes.values,
                  color=plt.cm.tab20(cluster),
                  alpha=0.7)

    # Write each count just above its bar.
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height}',
                ha='center', va='bottom')

    ax.set_title(f'Cluster {cluster} - Top Crime Categories (Total: {len(cluster_data)} crimes)',
                 fontsize=12, pad=15)
    ax.set_ylabel('Number of Crimes', fontsize=10)
    ax.tick_params(axis='x', rotation=90)
    ax.grid(axis='y', linestyle=':', alpha=0.7)

# Leave room between the panels, then add the overall title above them.
plt.tight_layout(pad=3.0)
fig.suptitle('Crime Category Distribution Across Clusters',
             y=1.02, fontsize=14, fontweight='bold')

plt.show()

In [None]:
plt.figure(figsize=(12, 6))

# Number of records per weekday, shown in calendar order.
ax = sns.countplot(x='DayOfWeek',
                  data=train_df,
                  order=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                         'Friday', 'Saturday', 'Sunday'],
                  palette='Reds',
                  saturation=0.9)

plt.title('Crime Distribution by Day of Week', fontsize=16, pad=20)
plt.xlabel('Day of Week', fontsize=12)
plt.ylabel('Number of Crimes', fontsize=12)
plt.xticks(rotation=45, ha='right')

# Label every bar with its count. Bar heights are floats, so they are cast to
# int before formatting (otherwise the label reads like "133,734.0"); bars
# without a finite height are skipped.
for p in ax.patches:
    height = p.get_height()
    if not np.isfinite(height):
        continue
    ax.annotate(f'{int(height):,}',
                (p.get_x() + p.get_width() / 2., height),
                ha='center', va='center',
                xytext=(0, 5),
                textcoords='offset points',
                fontsize=10)

# Horizontal grid lines drawn behind the bars.
ax.yaxis.grid(True, linestyle='--', alpha=0.7)
ax.set_axisbelow(True)

plt.tight_layout()
plt.show()

In [None]:
# Polar "clock" of the number of records per hour of the day.
plt.figure(figsize=(10, 10))
ax = plt.subplot(polar=True)

# Hour of day (0-23) extracted from the timestamp.
train_df['Hours'] = train_df['Dates'].dt.hour

# One angular slot per hour.
n_hours = 24
theta = np.linspace(0, 2*np.pi, n_hours, endpoint=False)

# Count per hour, re-indexed over all 24 hours so an hour with no records
# becomes 0 instead of shortening `counts` (which would no longer match theta).
counts = train_df['Hours'].value_counts().reindex(range(n_hours), fill_value=0)

# Each bar is exactly one slot wide; a wider bar would overlap its neighbours.
bars = ax.bar(theta, counts, width=2*np.pi/n_hours,
              color=plt.cm.Reds(np.linspace(0.2, 0.8, n_hours)))

# Clock layout: hour 0 at the top, hours increasing clockwise.
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)

plt.title('Crime Clock', pad=20, fontsize=16)
plt.xticks(theta, range(n_hours))
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Date-only column (no time component) so the animation has one frame per day
# instead of one per timestamp.
train_df["DateOnly"] = train_df["Dates"].dt.date

# Interactive animated crime map.
# Purpose: show where and when incidents occur.
# - each point is one incident, coloured by crime category
# - the slider/animation steps through the dates
# - hovering over a point shows the crime description
#
# Plotly Express orders animation frames by first appearance in the data, so
# the frame is sorted by timestamp to make the animation run chronologically.
# NOTE(review): the whole dataset is passed in, which produces one frame per
# distinct date - confirm the figure size is acceptable or sample first.
fig = px.scatter_mapbox(
    train_df.sort_values("Dates"),
    lat="Y",
    lon="X",
    color="Category",
    animation_frame="DateOnly",
    hover_name="Descript",
    zoom=11,

    # Initial map centre.
    center={"lat": 37.76, "lon": -122.43},

    height=600,
    mapbox_style="carto-positron",

    title="Timeline of Crimes in San Francisco"
)

fig.show()

In [None]:
# Sunburst chart of the training records, nested by the columns listed in `path`.
fig = px.sunburst(train_df, 
                  path=['PdDistrict', 'Category', 'Resolution'],  # Hierarchy: district -> category -> resolution
                  maxdepth=2)  # Render only 2 levels at a time
# Set the chart title
fig.update_layout(title='Crime Resolution Flow')  

# Display the chart
fig.show()