<a href="https://colab.research.google.com/github/Vikkysai/Python/blob/main/CRIME_DATA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Step 2: Load the data.
# The CSV location can be overridden through the CRIME_DATA_CSV environment
# variable (useful on Colab or another machine). When the variable is not set,
# the original local path is used, so existing setups keep working unchanged.
DEFAULT_CSV_PATH = r"C:\Users\Durgam saivivek\Downloads\archive5\crime_data.csv"
data = pd.read_csv(os.environ.get("CRIME_DATA_CSV", DEFAULT_CSV_PATH))

In [None]:
# Step 3: Display basic information and the first/last rows.
print(data.head())
print(data.tail())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()
# include='all' makes describe() summarise every column, not only numeric ones.
print(data.describe(include='all'))
print(data.shape)

In [None]:
# Report, column by column, how many entries are missing.
print("Missing values in Data:", data.isnull().sum(), sep="\n")

In [None]:
# Observation: the check above reports no missing values.


In [None]:
# Count the rows that are exact repeats of an earlier row.
print("Duplicates in Data:")
duplicate_row_count = data.duplicated().sum()
print(duplicate_row_count)

In [None]:
# Drop exact duplicate rows.
# drop_duplicates() returns a new frame (with inplace=True it returns None, so
# its result cannot be printed usefully). Rebind `data` to the de-duplicated
# frame and report how many rows were removed.
rows_before = len(data)
data = data.drop_duplicates()
print(f"Removed {rows_before - len(data)} duplicate rows; {len(data)} rows remain.")

In [None]:
# Re-count duplicate rows to verify the state of `data` at this point.
remaining_duplicates = data.duplicated().sum()
print(remaining_duplicates)


In [None]:
# Tally incidents per crime description and report the most frequent one.
crime_counts = data['Crm Cd Desc'].value_counts()
highest_crime, highest_crime_count = crime_counts.idxmax(), crime_counts.max()
print(
    "The most common crime type is "
    f"'{highest_crime}' with {highest_crime_count} incidents."
)

In [None]:
# Tally incidents per area name, then pick out the label and size of the
# largest group.
crime_by_area = data['AREA NAME'].value_counts()
highest_crime_area = crime_by_area.idxmax()
highest_crime_area_count = crime_by_area.max()
area_summary = (
    f"The area with the most recorded crimes is '{highest_crime_area}' "
    f"with {highest_crime_area_count} incidents."
)
print(area_summary)

In [None]:
# Bar chart of the `top_n` most frequent crime types
# (incident counts on the x axis, crime descriptions on the y axis).
top_n = 15
top_crimes = data['Crm Cd Desc'].value_counts().nlargest(top_n)

fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(x=top_crimes.values, y=top_crimes.index, ax=ax)
ax.set_title(f'Top {top_n} Most Common Crime Types', fontsize=16)
ax.set_xlabel('Number of Incidents', fontsize=14)
ax.set_ylabel('Crime Type', fontsize=14)
plt.show()


In [None]:
# Line plot of weapon description against victim age, given its own sized
# figure, title and axis labels so it can be read on its own like the other
# charts in this notebook.
# NOTE(review): "Weapon Desc" looks like a text/categorical column - confirm
# that a line plot is the intended chart for it.
plt.figure(figsize=(12, 6))
sns.lineplot(x="Vict Age", y="Weapon Desc", data=data)
plt.title('Weapon Description by Victim Age')
plt.xlabel('Victim Age')
plt.ylabel('Weapon Description')
# plt.show() renders the figure and keeps the Axes repr out of the cell output.
plt.show()

In [None]:
# 30-bin histogram of victim ages with a KDE curve; missing ages are dropped.
fig, ax = plt.subplots(figsize=(12, 6))
victim_ages = data['Vict Age'].dropna()
sns.histplot(victim_ages, bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Victim Ages')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Count of records for each value of the 'Vict Sex' column.
plt.figure(figsize=(8, 6))
sex_ax = sns.countplot(data=data, x='Vict Sex')
sex_ax.set_title('Gender Distribution of Victims')
sex_ax.set_xlabel('Gender')
sex_ax.set_ylabel('Count')
plt.show()

In [None]:
# Count of records per area, with bars ordered by how often each area occurs
# (value_counts() lists the most frequent area first).
plt.figure(figsize=(12, 6))
area_order = data['AREA NAME'].value_counts().index
sns.countplot(data=data, y='AREA NAME', order=area_order)
plt.title('Crimes by Area')
plt.show()


In [None]:
# Annotated heatmap of pairwise correlations between the float64/int64
# columns of the data set.
numeric_data = data.select_dtypes(include=['float64', 'int64'])
correlations = numeric_data.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Grid of 30-bin histograms drawn by DataFrame.hist, with a figure-level title.
hist_axes = data.hist(bins=30, figsize=(12, 10))
plt.gcf().suptitle('Histograms of Numerical Features in Crime Data')
plt.show()

In [None]:
# Names of the 10 most frequent crime types.
# Note: this binds `top_crimes` to an Index holding the names only (no counts).
top_crimes = data['Crm Cd Desc'].value_counts().nlargest(10).index

# Keep only the incidents whose crime type is one of those ten.
filtered_data = data[data['Crm Cd Desc'].isin(top_crimes)]

# Box plot of victim age for each of the top crime types.
plt.figure(figsize=(20, 15))
sns.boxplot(x='Crm Cd Desc', y='Vict Age', data=filtered_data)
plt.title("Victim's Age by Top 10 Crime Types")
plt.xlabel('Crime Type')
plt.ylabel("Victim's Age")
# Tilt the crime-type labels by 45 degrees and right-align them so the end of
# each label sits at its own tick.
plt.xticks(rotation=45, ha='right')
plt.show()
