In [5]:
import pandas as pd

# Load the cleaned crime data set from its CSV file into a pandas DataFrame.
df = pd.read_csv('Crime_Data_Set_Cleaned.csv')

# Keep only the 'Location Description' and 'Primary Type' columns of df
# in a new DataFrame called selected_columns_df.
columns_of_interest = ['Location Description', 'Primary Type']
selected_columns_df = df[columns_of_interest]

# We create a function (remove_duplicates) that takes the 'Primary Type' values of one group (a pandas Series, not a single row),
# drops any missing values, removes duplicates, sorts the unique values, and joins them into a comma-separated string
def remove_duplicates(row):
    """Return the distinct non-missing values of ``row`` as one sorted, comma-separated string.

    ``row`` is a pandas Series. Missing values are dropped, duplicates are
    collapsed, and the remaining values are sorted before being joined
    with ','. A Series with no non-missing values yields ''.
    """
    present_values = row.dropna()
    distinct_values = set(present_values)
    ordered_values = sorted(distinct_values)
    return ','.join(ordered_values)

# Goal: for every location, list the distinct crime types recorded there.
# Group selected_columns_df by 'Location Description' and collapse each group's 'Primary Type'
# values into one sorted, comma-separated string with remove_duplicates. reset_index() turns the
# result into the two-column DataFrame merged_df, with exactly one row per location.
merged_df = selected_columns_df.groupby('Location Description')['Primary Type'].apply(remove_duplicates).reset_index()

# Because merged_df already has one row per location, no pivot is needed (a cumulative count
# within each location would always be 0). Split the comma-separated string directly into one
# column per crime type, labelled 0, 1, 2, ...; locations with fewer crime types than the widest
# row are padded with missing values.
split_column_0 = merged_df['Primary Type'].str.split(',', expand=True)

# Concatenate the 'Location Description' column with the split crime-type columns.
restructured_df = pd.concat([merged_df[['Location Description']], split_column_0], axis=1)

# Save the resulting DataFrame to a CSV file for RapidMiner manipulation.
# NOTE(review): the leading '/' makes this an absolute path at the filesystem root, which needs
# write permission there -- confirm this is intended rather than a path relative to the notebook.
restructured_df.to_csv('/preprocessed_association_rules_mining_dataset.csv', index=False)

# Display the resulting DataFrame. By default a notebook only renders the value of the last
# expression in a cell, so this has to come after the to_csv call.
restructured_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,ARSON,BATTERY,BURGLARY,CRIMINAL DAMAGE,CRIMINAL TRESPASS,DECEPTIVE PRACTICE,INTERFERENCE WITH PUBLIC OFFICER,LIQUOR LAW VIOLATION,NARCOTICS,OTHER OFFENSE,...,,,,,,,,,,
1,BATTERY,DECEPTIVE PRACTICE,THEFT,,,,,,,,...,,,,,,,,,,
2,ASSAULT,BATTERY,CRIMINAL DAMAGE,DECEPTIVE PRACTICE,MOTOR VEHICLE THEFT,OTHER OFFENSE,THEFT,,,,...,,,,,,,,,,
3,ASSAULT,CONCEALED CARRY LICENSE VIOLATION,MOTOR VEHICLE THEFT,OFFENSE INVOLVING CHILDREN,OTHER OFFENSE,THEFT,,,,,...,,,,,,,,,,
4,ASSAULT,BATTERY,CRIMINAL SEXUAL ASSAULT,CRIMINAL TRESPASS,DECEPTIVE PRACTICE,MOTOR VEHICLE THEFT,OTHER OFFENSE,PUBLIC PEACE VIOLATION,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,BATTERY,CRIMINAL DAMAGE,DECEPTIVE PRACTICE,THEFT,,,,,,,...,,,,,,,,,,
106,ASSAULT,BATTERY,CRIMINAL DAMAGE,DECEPTIVE PRACTICE,INTERFERENCE WITH PUBLIC OFFICER,PROSTITUTION,ROBBERY,SEX OFFENSE,THEFT,,...,,,,,,,,,,
107,ARSON,ASSAULT,BATTERY,BURGLARY,CONCEALED CARRY LICENSE VIOLATION,CRIMINAL DAMAGE,CRIMINAL SEXUAL ASSAULT,CRIMINAL TRESPASS,DECEPTIVE PRACTICE,INTERFERENCE WITH PUBLIC OFFICER,...,THEFT,WEAPONS VIOLATION,,,,,,,,
108,ASSAULT,BATTERY,BURGLARY,CRIMINAL DAMAGE,DECEPTIVE PRACTICE,HOMICIDE,INTERFERENCE WITH PUBLIC OFFICER,LIQUOR LAW VIOLATION,OTHER OFFENSE,PUBLIC PEACE VIOLATION,...,,,,,,,,,,
