# Data Analysis 


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Important: You need to save your file as py (jupiter notebook --> file-->download as py )
from cleanData import DataPreparationCleaning

class DataAnalysis:
    def __init__(self, file):
        # Create instance of class DataPreparationCleaning - yes this also works 
        # This means that ypu can now get access to all the attributes and methods of the class
        self.dataPreparation = DataPreparationCleaning(file) # Instance
        self.preparedData = self.dataPreparation.finalDataframe # Tell the instance to get the finalDataframe and save it in self.preparedData
        
    def performAnalysis(self):
        self.totalCrimeCount = self.totalCrimeCount(self.preparedData)
        self.crimeCount_byArea = self.crimeCount(self.preparedData) # crime counts total per area
        self.specificCrimeCount_byArea = self.specificCrimeCount(self.preparedData) # 15 highest crime cases per area
        self.crimeCountsVictSex = self.victSexCrimeCount(self.preparedData)
        self.crimeCountsVictSex_byArea = self.victSexCrimeCountbyArea(self.preparedData)
        self.crimeCount_byTimeOfDay = self.crimeCountByTimeOfDay(self.preparedData)
        self.monthly_crime_counts = self.crimeCasesByMonth(self.preparedData)
        self.crimeCasesGenderMonth = self.crimeCasesGenderMonth(self.preparedData)
        
        print(self.crimeCasesGenderMonth)
        
    def saveDataFrames(self):
        self.crimeCount_byArea.to_csv('crimeCount_byArea.csv', index=False)
        self.specificCrimeCount_byArea.to_csv('specificCrimeCount_byArea.csv', index=False)
        self.crimeCountsVictSex.to_csv('crimeCountsVictSex.csv', index=False)
        self.crimeCountsVictSex_byArea.to_csv('crimeCountsVictSex_byArea.csv', index=False)
        self.crimeCount_byTimeOfDay.to_csv('crimeCount_byTimeOfDay.csv', index=False)
        self.monthly_crime_counts.to_csv('monthly_crime_counts.csv', index=False)
        self.crimeCasesGenderMonth.to_csv('crimeCasesGenderMonth.csv', index=False)

    def totalCrimeCount(self, dataframe):
        totalCrimeCases = dataframe['Crm Cd Desc'].count()
        return totalCrimeCases

    def crimeCasesByMonth(self, dataframe):
        dataframe['Month'] = dataframe['DATE OCC'].dt.month
        monthly_crime_counts = dataframe.groupby('Month')['Crm Cd Desc'].count().reset_index(name='Crime Count')
        return monthly_crime_counts
    
    def crimeCasesGenderMonth(self, dataframe):
        dataframe['Monat'] = dataframe['DATE OCC'].dt.strftime('%B')
        crimeCount_byGenderAndMonth = dataframe.groupby(['Vict Sex', 'Monat'])['Crm Cd Desc'].count().reset_index(name='total_count')
        crimeCount_byGenderAndMonth = crimeCount_byGenderAndMonth.sort_values(by=['Vict Sex', 'Monat'])
        return crimeCount_byGenderAndMonth
    
    def crimeCount(self, dataframe):
        crimeCount_byArea = dataframe.groupby('AREA NAME')['Crm Cd Desc'].count().reset_index(name='total_count')
        crimeCount_byArea = crimeCount_byArea.sort_values(by='total_count', ascending=False)
        return crimeCount_byArea

    def crimeCountByTimeOfDay(self, dataframe):
        bins = [0, 600, 1200, 1800, 2400]
        labels = ['Morning', 'Afternoon', 'Evening', 'Night']
        time_of_day = pd.cut(dataframe['TIME OCC'], bins=bins, labels=labels, right=False)
        dataframe['TimeOfDay'] = pd.Categorical(time_of_day, categories=['Morning', 'Afternoon', 'Evening', 'Night'], ordered=True)
        crimeCount_byTimeOfDay = dataframe.groupby(['AREA NAME', 'TimeOfDay']).size().reset_index(name='count')
        return crimeCount_byTimeOfDay.sort_values(['AREA NAME', 'TimeOfDay'])

    def specificCrimeCount(self, dataframe):
        specificCrimeCount_byArea = dataframe.pivot_table(index='Crm Cd Desc', columns='AREA NAME', aggfunc='size', fill_value=0)
        tota_count = specificCrimeCount_byArea.sum(axis=1) # total sum per row
        top_15_counts = tota_count.sort_values(ascending=False).head(15).index # finding 15 highest crime counts
        top15_specificCrimesCounts_byArea = specificCrimeCount_byArea.loc[top_15_counts]
        return top15_specificCrimesCounts_byArea
    
    def victSexCrimeCount(self, dataframe):
        crimeByVictSex = dataframe.groupby('Vict Sex')['Crm Cd Desc'].count().reset_index(name='total_count')
        return crimeByVictSex
    
    def victSexCrimeCountbyArea(self, dataframe):
        crimeByVictSex_byArea = dataframe.pivot_table(index='Crm Cd Desc', columns='Vict Sex', aggfunc='size', fill_value=0)
        total_counts = crimeByVictSex_byArea.sum(axis=1)
        top_15_counts = total_counts.head(15).index
        crimeByVictSex_byArea = crimeByVictSex_byArea.loc[top_15_counts]
        return crimeByVictSex_byArea
        
def main():
    file = 'Crime_Data_from_2020_to_Present.csv'
    analysisApp = DataAnalysis(file)
    analysisApp.performAnalysis()
    analysisApp.saveDataFrames()
main()


  dataFrame[column] = pd.to_datetime(dataFrame[column], errors='coerce')


   Vict Sex      Monat  total_count
0         F      April        25027
1         F     August        20148
2         F   December        19538
3         F   February        25449
4         F    January        26738
5         F       July        20669
6         F       June        25606
7         F      March        25921
8         F        May        26056
9         F   November        19110
10        F    October        20555
11        F  September        19636
12        H      April            8
13        H     August            9
14        H   December            6
15        H   February            4
16        H    January           13
17        H       July            5
18        H       June            6
19        H      March            5
20        H        May            7
21        H   November           11
22        H    October            6
23        H  September            4
24        M      April        26286
25        M     August        21240
26        M   December      