# Analysis and Insights

## Crime Rate and Income Correlation 
* There's a negative correlation between Average Crime Rate and Average Income
* Lower-income areas have higher crime rates.
* The r value is negative, indicating a negative relationship.
* p-value < 0.005, so the result is statistically significant.
* The stacked bar graph shows the Crime Count per neighborhood for each Income Group; it shows that lower-income regions have a higher crime rate per neighborhood.

## Red Light Cameras Analysis
* From the red light camera and speeding graphs, we can see that the majority of red light cameras are installed in regions with an average income of 50-100K. Comparing this with the total number of speeding tickets per income region, we see that the same income region also has the highest number of speeding tickets. We can conclude from this that red light cameras are not necessarily placed in low-income regions, but rather in regions where speeding is more commonplace.

## House Prices and Crime Rate Correlation
* There appears to be a very weak negative correlation between crime rate and house prices.

## Starbucks Locations Analysis
* The Number of Starbucks Stores vs. Average Income scatter plot suggests that there is no correlation between the number of Starbucks stores in a neighborhood and its average income.


In [4]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st

In [5]:
# Load the combined dataset. The path is relative to the current working
# directory; the recorded run of this cell raised FileNotFoundError, so the
# notebook must be started from a directory where this relative path resolves.
final_df = pd.read_csv(filepath_or_buffer='Graphs and Analysis/Final_Data2.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'Graphs and Analysis/Final_Data2.csv'

In [None]:
# Preview the first five rows of the loaded data
final_df.head(n=5)

## Crime and Income

In [None]:
# Use the 1.5 * IQR rule to find and remove potential outliers in Average Income.

# First and third quartiles of the income column, and the interquartile range
income_col = final_df['Average_Income']
first_quartile = income_col.quantile(0.25)
third_quartile = income_col.quantile(0.75)
iqr = third_quartile - first_quartile

# Fences: values more than 1.5 * IQR outside the quartiles count as outliers
lower_bound = first_quartile - (1.5 * iqr)
upper_bound = third_quartile + (1.5 * iqr)

# Keep only the rows whose income lies within the fences (both ends inclusive)
within_fences = (income_col >= lower_bound) & (income_col <= upper_bound)
outliers_removed_df = final_df.loc[within_fences, :]
outliers_removed_df

In [None]:
# Total crime rate per income group, using the data with income outliers removed
sorted_df = outliers_removed_df.sort_values("Average_Income")

# sort=False keeps the groups in the order they first appear in the
# income-sorted frame (ascending income). Without it, groupby re-sorts by the
# group label and the sort above has no effect on the chart.
df = sorted_df.groupby('Income Group', sort=False)['Total Average Rate'].sum()

# A Series plots its index on the x axis and its values on the y axis
df.plot(kind='bar')
plt.title("Income Range vs. Total Crime Rate")
plt.xlabel("Income Range")
plt.ylabel("Total Crime Rate")
plt.xticks(rotation=90)
# tight_layout must be called; the bare attribute reference does nothing
plt.tight_layout()
plt.show()

In [None]:
# Create a linear regression plot of crime rate against average income
# (income outliers removed)

income = outliers_removed_df["Average_Income"]
crime = outliers_removed_df["Total Average Rate"]

# Fit the regression line. The intercept must not be named `int`: that would
# rebind the builtin for the rest of the notebook and break later cells that
# rely on it (e.g. `.astype(int)`).
slope, intercept, r, p, std_err = st.linregress(income, crime)

# Fitted crime rate for every income value along the regression line
eq_line = slope * income + intercept

# Scatter of the observed data with the fitted line drawn dashed on top
plt.scatter(income, crime, color='red', label='actual data')
plt.plot(income, eq_line, '--')
plt.xlabel("Average Income")
plt.ylabel("Average Crime Rate")
plt.show()

# Correlation coefficient and p-value of the fit
print(r)
print(p)

## Crime and Income Analysis

In [None]:
# Sort the data on income so the income groups can be charted in ascending order
sorted_df = final_df.sort_values("Average_Income", ascending=True)

# Count the neighbourhoods in each income group. sort=False keeps the groups in
# the order they first appear in the income-sorted frame; otherwise groupby
# re-sorts by the group label and the sort above has no effect.
count_neighborhood_df = sorted_df.groupby("Income Group", sort=False)["Neighbourhood"].count()

# Drop groups with no neighbourhoods. A boolean mask keeps the counts as
# integers without needing the builtin `int`, so the cell does not depend on
# what other cells have bound to that name.
count_neighborhood_df = count_neighborhood_df[count_neighborhood_df != 0]

# Plot the bar graph showing the count of neighbourhoods for each income range
count_neighborhood_df.plot(kind='bar', color='crimson', figsize=(12, 7))

# Set labels for axes
plt.xticks(rotation=90, fontsize=13)
plt.xlabel("Income Group", fontsize=15)
plt.ylabel("Neighborhood Count", fontsize=15)
plt.title("Neighborhood Count for each Income Group", fontsize=18)

# Display resulting plot
plt.tight_layout()
plt.show()

In [None]:
# Crime-rate columns that are summed per income group and stacked in the chart.
# The keys are kept exactly as spelled in the original lookups.
crime_columns = [
    'Assault Rate',
    'Auto Theft Rate',
    'Break&Enter Rate',
    'Homicide Rate',
    'Robberey Rate',
    'Theft Over Rate',
]

# Sum each crime type within every income group. The string 'sum' selects the
# pandas implementation directly instead of passing the Python builtin.
grouped_df = final_df.groupby("Income Group").agg(
    {column: 'sum' for column in crime_columns}
)

# Count neighbourhoods for each income group and attach it as a helper column
count_neighborhood_df = final_df.groupby("Income Group")["Neighbourhood"].count()
grouped_df["Count_Neighborhood"] = count_neighborhood_df

# Treat zeros as missing, keep only groups that have at least 6 non-missing
# values across the 7 columns, then turn the remaining gaps back into 0
grouped_df = grouped_df.replace(0, np.nan).dropna(thresh=6).fillna(0)

# Divide each crime total by the neighbourhood count to get a per-neighbourhood value
for column in crime_columns:
    grouped_df[column] = grouped_df[column] / grouped_df["Count_Neighborhood"]

# Plot the stacked bar graph of crime per neighbourhood for each income range.
# Only the six crime columns are plotted: Count_Neighborhood is a helper column
# and would otherwise be stacked into the bars as a seventh series.
colors = ['purple', 'gold', 'crimson', 'darkgoldenrod', 'plum', 'royalblue']
grouped_df[crime_columns].plot(kind='bar', stacked=True, color=colors, figsize=(12, 7))

# Set labels for axes
plt.xticks(rotation=90, fontsize=13)
plt.xlabel("Income Group", fontsize=15)
plt.ylabel("Crime Count per Neighborhood", fontsize=15)
plt.title("Crime Count per Neighborhood for each Income Group", fontsize=18)

# Display resulting plot
plt.tight_layout()
plt.show()

## Red Light Cameras Analysis

In [None]:
# Display the total number of red lights per income group
sorted_df = final_df.sort_values("Average_Income")

# sort=False keeps the groups in ascending-income order (order of first
# appearance in the sorted frame) instead of re-sorting by group label
df1 = sorted_df.groupby('Income Group', sort=False)['Number of Red Lights'].sum()

# A Series plots its index on the x axis and its values on the y axis
df1.plot(kind='bar')
plt.title("Income Range vs. Total Red Lights")
plt.xlabel("Income Range")
plt.ylabel("Total Red Lights")
plt.xticks(rotation=90)
# tight_layout must be called; the bare attribute reference does nothing
plt.tight_layout()
plt.show()

In [None]:
# Display the total speeding count per income group.
# Relies on `sorted_df` as left by an earlier cell (the income-sorted frame).
df2 = sorted_df.groupby('Income Group', sort=False)['Speeding'].sum()

# A Series plots its index on the x axis and its values on the y axis
df2.plot(kind='bar')
plt.title("Income Range vs. Speeding Count")
plt.xlabel("Income Range")
plt.ylabel("Total Speeding Counts")
plt.xticks(rotation=90)
# tight_layout must be called; the bare attribute reference does nothing
plt.tight_layout()
plt.show()

## Starbucks Locations Analysis

In [None]:
# Helper that fits a linear regression, plots it over the data and prints the fit statistics
def linear_regression_plot(x_values, y_values, x_label, y_label, annotate_point):
    """Scatter-plot ``y_values`` against ``x_values`` with a fitted regression line.

    The regression is fitted only on positions where neither input is NaN.
    The line equation is written on the figure and the correlation
    coefficient, p value and r squared are printed after the figure is shown.

    Args:
        x_values: Data for the horizontal axis; must work with ``np.isnan``
            and boolean-mask indexing.
        y_values: Data for the vertical axis; same requirements as ``x_values``.
        x_label: Text for the x axis, also used in the title.
        y_label: Text for the y axis, also used in the title.
        annotate_point: Position passed to ``plt.annotate`` for the equation text.

    Returns:
        None.
    """
    # Positions where either coordinate is missing are left out of the fit
    missing = np.isnan(x_values) | np.isnan(y_values)
    usable = ~missing
    fit = st.linregress(x_values[usable], y_values[usable])
    slope = fit.slope
    intercept = fit.intercept
    r = fit.rvalue
    p = fit.pvalue

    # Points on the fitted line, evaluated at every supplied x value
    regression_values = slope * x_values + intercept

    # Text form of the line equation for the annotation
    eq_line = f"{round(slope, 2)}x + {round(intercept, 2)}"

    # Draw the scatter, the fitted line and the equation
    plt.figure(figsize=(8, 6))
    plt.scatter(x_values, y_values, color='green', alpha=0.5)
    plt.plot(x_values, regression_values, color='green')
    plt.annotate(eq_line, annotate_point, color='green', fontsize=16)

    # Title, axis labels and grid
    plt.title(f'{y_label} vs. {x_label}', fontsize=15)
    plt.xlabel(f'{x_label}', fontsize=12)
    plt.ylabel(f'{y_label}', fontsize=12)
    plt.grid()

    # Show the figure, then report the fit statistics
    plt.show()

    print(f'The correlation coefficient is {round(r, 5)}.')
    print(f'p value is {round(p, 5)}.')
    print(f'r square value is {round(r**2, 5)}.')

In [None]:
# Columns of final_df used by the regression plots (outliers are NOT excluded)

final_avg_income = final_df['Average_Income']
final_med_income = final_df['Median_Income']
final_sb = final_df['Number of Starbucks Stores']
final_crime = final_df['Total Average Rate']
final_home_prices = final_df['Home Prices']
final_population = final_df['Population, 2016']
final_red_lights = final_df['Number of Red Lights']
final_speeding = final_df['Speeding']

# Axis/title labels matching the columns above
starbucks_label = 'Total Number of Starbucks Stores'
avg_income_label = 'Average Income'
med_income_label = 'Median Income'
home_prices_label = 'House Prices'
population_label = 'Population'
crime_label = 'Total Average Crime Rate'
red_lights_label = 'Total Red Lights'
# Spelling corrected from 'Spedding' so the label reads correctly on a chart
speeding_label = 'Total Speeding Counts'

In [None]:
# final_df: regress the number of Starbucks stores on average income
linear_regression_plot(
    x_values=final_avg_income,
    y_values=final_sb,
    x_label=avg_income_label,
    y_label=starbucks_label,
    annotate_point=(300000, 5),
)

## House Prices and Crime Rates

In [None]:
# final_df: regress the total average crime rate on house prices
linear_regression_plot(
    x_values=final_home_prices,
    y_values=final_crime,
    x_label=home_prices_label,
    y_label=crime_label,
    annotate_point=(1000000, 500),
)