In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import re

# load dataset
data = pd.read_excel('manually_tagged_sentences.xlsx')
data.dropna(inplace=True)

# split dataset into train and test sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=0)

# vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
train_vectors = vectorizer.fit_transform(train_data['sentences'])
test_vectors = vectorizer.transform(test_data['sentences'])

# train the SVM model on the vectorized data
model = LinearSVC()
model.fit(train_vectors, train_data['type'])

# predict the labels of the test set
predictions = model.predict(test_vectors)

# evaluate the accuracy of the model
accuracy = accuracy_score(test_data['type'], predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7317073170731707


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import joblib
from openpyxl import load_workbook

# load dataset
data = pd.read_excel('manually_tagged_sentences.xlsx')
#data['type'] = data['type'].fillna(value='NEITHER')
data.dropna(inplace=True)

# split dataset into train and test sets
train_data, test_data = train_test_split(data, test_size=0.4, random_state=42)

# vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
train_vectors = vectorizer.fit_transform(train_data['sentences'])
test_vectors = vectorizer.transform(test_data['sentences'])

# train the SVM model on the vectorized data
model = LinearSVC()
model.fit(train_vectors, train_data['type'])

# save the trained model to a file
joblib.dump(model, 'svm_model.sav')

# load the trained model from the file
loaded_model = joblib.load('svm_model.sav')

# load the new data to make predictions on
new_data = pd.read_excel('Final all_extracted_sentences.xlsx')
new_data.dropna(inplace=True)

# vectorize the new data
new_vectors = vectorizer.transform(new_data['sentences'])

# predict the labels of the new data using the loaded model
new_predictions = loaded_model.predict(new_vectors)

# add the predictions to the new_data dataframe
new_data['type'] = new_predictions

# append the new_data dataframe to the existing excel file using the openpyxl engine
with pd.ExcelWriter('Final all_extracted_sentences.xlsx', engine='openpyxl', mode='a') as writer:
    writer.book = load_workbook('Copy of crime_sentiment_analysis final data.xlsx')
    writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
    new_data.to_excel(writer, sheet_name='crime_sentiment_testing.xlsx', index=False, header=False)


In [3]:
#importing test dataset
data = pd.read_excel('Final all_extracted_sentences.xlsx',sheet_name=1)
data.columns = ['date', 'sentences', 'type']
print(data.columns)

Index(['date', 'sentences', 'type'], dtype='object')


In [9]:
print(data)

                    date                                          sentences  \
0     September 27, 2021   and miigwech/thank you to grandmother irene f...   
1     September 27, 2021   the peoples of the algonquin anishinabe natio...   
2     September 27, 2021   their culture and presence have nurtured and ...   
3     September 27, 2021   our service honours all first nations, inuit ...   
4     September 27, 2021   in june of this year, the federal government ...   
...                  ...                                                ...   
3751    January 26, 2015   inspector john mcgetrick and i will be plungi...   
3752    January 26, 2015   i encourage board members to come out to the ...   
3753    January 26, 2015    i am happy to answer any questions you may have   
3754    January 26, 2015                                          thank you   
3755    January 26, 2015                                                      

                 type  
0     COUNTER MEASURE  
1  

In [10]:
crime_df = data[data['type'] == 'CRIME']

In [11]:
#regex for crime 
homicide_regex = re.compile(r"(murder|killing|death|homicide|manslaughter|Guns|shooting|died|offenders|firearm|shots|fired|shoot)", re.IGNORECASE)
stabbing_regex = re.compile(r"(stabbing|stabbed|knife attack|knife)", re.IGNORECASE)
police_regex = re.compile(r"(assaulted the officer)", re.IGNORECASE)
stolen_regex = re.compile(r"(Theft|stolen|fraud|possesion)", re.IGNORECASE)
drug_regex = re.compile(r"(drug|marijuana|substances|cannabis|products|narcotics|overdosing|overdosed)", re.IGNORECASE)
driving_regex = re.compile(r"(stunt|driving|licence|demerit|fined|car)", re.IGNORECASE)
hateful_regex = re.compile(r"(hateful|hate|speech)", re.IGNORECASE)
assault_regex = re.compile(r"(sexual|sexual assault|harassment|harassing|abusing|abuse|threatening|fighting|rape)", re.IGNORECASE)




#function to detect the type of crime in a sentence
def detect_crime_type(sentence):
    if re.search(homicide_regex, sentence):
        return "homicide"
    elif re.search(stabbing_regex, sentence):
        return "stabbing"
    elif re.search(police_regex, sentence):
        return "total assaults against a peace officer"
    elif re.search(stolen_regex, sentence):
        return "total possession of stolen property"
    elif re.search(drug_regex, sentence):
        return "drug violations"
    elif re.search(driving_regex, sentence):
        return "driving violations"
    elif re.search(hateful_regex, sentence):
        return "speech violations"
    elif re.search(assault_regex, sentence):
        return "assault and harrassment"
    else:
        return "Unknown"

def detect_num_people(text):
    pattern = r"\b(\d+|one|two|three|four|five|six|seven|eight|nine|ten)\b(?=\s*(?:for\s)?(?:criminals?|illicit?|robberies?|arrests?|suspects?|offenders?|men|people|individuals|stunt|criminal|charges))"
    match = re.search(pattern, text)
    if match:
        num_str = match.group(1)
        if num_str.isdigit():
            num = int(num_str)
            if num > 100: # Assume it's an age
                return None
            elif num >= 10 and 'old' in text: # Assume it's an age
                return None
            else:
                return num
        elif num_str == 'one':
            return 1
        elif num_str == 'two':
            return 2
        elif num_str == 'three':
            return 3
        elif num_str == 'four':
            return 4
        elif num_str == 'five':
            return 5
        elif num_str == 'six':
            return 6
        elif num_str == 'seven':
            return 7
        elif num_str == 'eight':
            return 8
        elif num_str == 'nine':
            return 9
        elif num_str == 'ten':
            return 10
    else:
        return None


crime_df.loc[:, "num_people_involved"] =crime_df["sentences"].apply(detect_num_people).replace(np.nan, 1).astype(int)
crime_df.loc[:, "crime_type"] = crime_df["sentences"].apply(detect_crime_type)


# Print the modified DataFrame
print(crime_df)

                    date                                          sentences  \
13    September 27, 2021   shirley is a member of the mohawk first natio...   
14    September 27, 2021   she was the officer in charge of the rcmp’s r...   
24    September 27, 2021   this week we will host a “badge ceremony” in ...   
30    September 27, 2021                            40% identify as women 2   
43    September 27, 2021   i look forward to seeing members of the board...   
...                  ...                                                ...   
3732    January 26, 2015   the canine unit also attended the scene and l...   
3733    January 26, 2015   trevor pratt, age 30, of ottawa, is charged w...   
3739    January 26, 2015   this includes the official launch on january ...   
3745    January 26, 2015   the event takes place at carleton university ...   
3750    January 26, 2015   the largest awareness and funding drive leadi...   

       type  num_people_involved          crime_typ

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crime_df.loc[:, "num_people_involved"] =crime_df["sentences"].apply(detect_num_people).replace(np.nan, 1).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crime_df.loc[:, "crime_type"] = crime_df["sentences"].apply(detect_crime_type)


In [12]:
crime_df = crime_df.drop('type', axis=1)
print(crime_df)

                    date                                          sentences  \
13    September 27, 2021   shirley is a member of the mohawk first natio...   
14    September 27, 2021   she was the officer in charge of the rcmp’s r...   
24    September 27, 2021   this week we will host a “badge ceremony” in ...   
30    September 27, 2021                            40% identify as women 2   
43    September 27, 2021   i look forward to seeing members of the board...   
...                  ...                                                ...   
3732    January 26, 2015   the canine unit also attended the scene and l...   
3733    January 26, 2015   trevor pratt, age 30, of ottawa, is charged w...   
3739    January 26, 2015   this includes the official launch on january ...   
3745    January 26, 2015   the event takes place at carleton university ...   
3750    January 26, 2015   the largest awareness and funding drive leadi...   

      num_people_involved          crime_type  
13 

In [14]:
crime_df.drop(crime_df.loc[crime_df['crime_type'] == 'Unknown'].index, inplace=True)

In [15]:
print(crime_df)

                    date                                          sentences  \
14    September 27, 2021   she was the officer in charge of the rcmp’s r...   
48    September 27, 2021   stunt driving offences doubled during last ye...   
51    September 27, 2021   in a brazen act of criminal violence , the ma...   
58    September 27, 2021   the man was charged with attempted murder, po...   
59    September 27, 2021   the security guard suffered non -life-threate...   
...                  ...                                                ...   
3679   February 23, 2015   the suspect left his friend’s room and entere...   
3681   February 23, 2015   he has been charged with sexual assault and b...   
3706    January 26, 2015   that investigation lead to a search warrant o...   
3708    January 26, 2015   arrest on attempted murder charges this past ...   
3745    January 26, 2015   the event takes place at carleton university ...   

      num_people_involved               crime_type 

In [16]:
#export to excel as final dataset
output_path = 'Final crime_analyzer_output.xlsx'
crime_df.to_excel(output_path, index=False)