In [None]:
import pandas as pd
import requests
import spacy
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import PyPDF2, io
import re
import numpy as np

In [None]:
def initialize_analysis():
    """Run the demo crime analysis over the PDFs listed in the meetings workbook.

    Reads 'regular_meetings_v2.3.xlsx', drops every row that has a missing
    value, and for each remaining row whose index label is below 15 downloads
    the PDF at 'Verbal Report File URL', extracts its text, runs
    ``sentiment_analyzer`` on it and appends the resulting global ``crime_df``
    to the global ``crimes_data_df``. The accumulated frame is written out
    through ``export_excel`` once the loop ends.

    Side effects:
        Rebinds the module globals ``crimes_data_df`` and ``corpus``, performs
        HTTP requests, prints progress, and writes the output workbook.

    Raises:
        requests.HTTPError: if a report URL answers with an error status.
    """
    global crimes_data_df, corpus
    crimes_data_df = pd.DataFrame()

    # Rows with any missing cell are discarded before the URLs are read.
    reports_df = pd.read_excel('regular_meetings_v2.3.xlsx').dropna()
    pdf_urls = reports_df[['Date', 'Verbal Report File URL']]
    print(pdf_urls)

    try:
        for index, row in pdf_urls.iterrows():
            # `index` is the row label that survived dropna(), not a running
            # counter; only labels below 15 take part in the demo run.
            if index >= 15:
                continue

            url = row['Verbal Report File URL']
            # A timeout keeps one unresponsive server from hanging the run,
            # and an error status is reported as such instead of being fed
            # to the PDF parser as if it were a document.
            resp = requests.get(url, timeout=60)
            resp.raise_for_status()

            with io.BytesIO(resp.content) as file:
                pdf = PyPDF2.PdfReader(file)
                # Pages are read from the buffer on access, so the text must
                # be collected while the buffer is still open.
                corpus = "".join(page.extract_text() or "" for page in pdf.pages)

            print("link started: ", index, url)
            sentiment_analyzer(corpus, row['Date'])
            # sentiment_analyzer publishes its result in the global crime_df.
            # DataFrame.append no longer exists in pandas 2.x; concat is the
            # equivalent operation.
            crimes_data_df = pd.concat([crimes_data_df, crime_df])
            print("link over: ", index, url)
            print(crimes_data_df)
    finally:
        # Write the workbook once instead of once per row; doing it in
        # `finally` still saves whatever was collected if a report fails.
        export_excel()

    print("Demo Analysis Completed! Please check the output file: 'DEMO_crime_analysis_output.xlsx' for the output.")
 
def sentiment_analyzer(corpus, date):
    """Build the crime table for one report and publish it as global ``crime_df``.

    The text is flattened to one lower-cased line and split into sentences
    with the NLTK punkt tokenizer. Sentences whose VADER 'neg' score exceeds
    their 'pos' score are kept, then narrowed to those for which
    ``contains_crime`` returns True. The surviving rows get two extra columns:
    'num_people_involved' (from ``detect_num_people``, defaulting to 1) and
    'crime_type' (from ``detect_crime_type``).

    Args:
        corpus: Full text of the report.
        date: Value stored in the 'Date' column of every output row.

    Returns:
        The resulting DataFrame, which is also bound to the module global
        ``crime_df`` for callers that read it from there.
    """
    global crime_df

    text = corpus.replace("\n", " ").lower()
    tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
    analyzer = SentimentIntensityAnalyzer()

    # Keep only sentences that score more negative than positive.
    negative_sentences = []
    for sentence in tokenizer.tokenize(text):
        scores = analyzer.polarity_scores(sentence)
        if scores['neg'] > scores['pos']:
            negative_sentences.append(sentence)

    candidates = pd.DataFrame({'sentences': negative_sentences})
    candidates.insert(0, 'Date', date)
    candidates['contains_crime'] = candidates['sentences'].apply(contains_crime)

    # .eq(True) rather than using the column as a mask: it also treats a
    # non-boolean result such as None as "no". The .copy() makes the filtered
    # frame independent, so the column assignments below do not write into a
    # slice of `candidates` (the source of SettingWithCopyWarning).
    crime_df = candidates[candidates['contains_crime'].eq(True)].copy()
    print(crime_df)

    # detect_num_people yields an int or None; a missing count defaults to 1.
    people = crime_df["sentences"].apply(detect_num_people)
    crime_df["num_people_involved"] = (
        pd.to_numeric(people, errors="coerce").fillna(1).astype(int)
    )
    crime_df["crime_type"] = crime_df["sentences"].apply(detect_crime_type)
    return crime_df


# Vocabulary and entity labels used by contains_crime().
_CRIME_WORDS = (
    'homicide', 'murder', 'kill', 'sexual', 'assault', 'drug', 'shotgun',
    'rob', 'criminal', 'charge', 'rape', 'violence', 'attack',
    'sexual assault', 'robbery', 'shoot', 'gun',
)
# NOTE(review): several of these labels look unlikely to be produced by the
# stock en_core_web_sm pipeline, while 'PERSON' and 'MONEY' flag any sentence
# naming a person or an amount - confirm this list is what is intended.
_CRIME_ENTITY_LABELS = frozenset({
    'CRIME', 'LAW', 'MURDER', 'PERSON', 'WEAPON', 'MONEY', 'GUN',
    'CRIMINAL CHARGES', 'NUMBERS',
})
_CRIME_NLP_CACHE = {}


def _crime_nlp():
    """Return the spaCy pipeline and the parsed crime words, loading them once."""
    if not _CRIME_NLP_CACHE:
        nlp = spacy.load("en_core_web_sm")
        _CRIME_NLP_CACHE["nlp"] = nlp
        _CRIME_NLP_CACHE["word_docs"] = [nlp(word) for word in _CRIME_WORDS]
    return _CRIME_NLP_CACHE["nlp"], _CRIME_NLP_CACHE["word_docs"]


def contains_crime(sentence):
    """Return True when the sentence looks crime-related, otherwise False.

    Three checks run in order, and the first hit wins:
      1. any named entity carries one of the labels in _CRIME_ENTITY_LABELS;
      2. any verb has a lemma listed in _CRIME_WORDS;
      3. any token has a similarity above 0.6 to one of the crime words.

    Args:
        sentence: Text of a single sentence.

    Returns:
        bool: always a real boolean, including for a sentence with no tokens.
    """
    # Loading the model for every sentence dominated the run time; the
    # pipeline and the crime-word docs are built once and reused.
    nlp, crime_word_docs = _crime_nlp()
    doc = nlp(sentence)

    if any(ent.label_ in _CRIME_ENTITY_LABELS for ent in doc.ents):
        return True

    if any(token.pos_ == 'VERB' and token.lemma_ in _CRIME_WORDS for token in doc):
        return True

    # Every token is compared. Previously the negative result was returned
    # from inside the token loop, so only the first token was ever checked.
    return any(
        token.similarity(word_doc) > 0.6
        for token in doc
        for word_doc in crime_word_docs
    )


# Ordered (pattern, label) table for detect_crime_type(): the first pattern
# found anywhere in the sentence decides the label, so order is significant.
# Patterns are compiled once here instead of on every call.
_CRIME_TYPE_PATTERNS = (
    (re.compile(r"(murder|killing|death|homicide|manslaughter|Guns|shooting|died|offenders|firearm|shots|fired|shoot)", re.IGNORECASE),
     "homicide"),
    (re.compile(r"(stabbing|stabbed|knife attack|knife)", re.IGNORECASE),
     "stabbing"),
    # "violence" added next to the misspelled "violance", which is kept so
    # that text matched before still matches.
    (re.compile(r"(violance|violence|violent| encounter|police officer|injured|assaulted the officer|constable)", re.IGNORECASE),
     "total assaults against a peace officer"),
    # "possession" added next to the misspelled "possesion" for the same reason.
    (re.compile(r"(Theft|stolen|fraud|possesion|possession)", re.IGNORECASE),
     "total possession of stolen property"),
    (re.compile(r"(drug|marijuana|substances|cannabis|products|narcotics|overdosing|overdosed)", re.IGNORECASE),
     "drug violations"),
    (re.compile(r"(stunt|driving|licence|demerit|fined|car)", re.IGNORECASE),
     "driving violations"),
    (re.compile(r"(hateful|hate|speech)", re.IGNORECASE),
     "speech violations"),
    (re.compile(r"(sexual|sexual assault|harassment|harassing|abusing|abuse|threatening|fighting|rape)", re.IGNORECASE),
     "assault and harrassment"),
)


def detect_crime_type(sentence):
    """Classify a sentence into a crime category by keyword search.

    The keyword groups in _CRIME_TYPE_PATTERNS are tried in order and the
    label of the first group with a case-insensitive match is returned.
    Matching is by substring, not by whole word.

    Args:
        sentence: Text to classify.

    Returns:
        str: the category label, or "Unknown" when no group matches.
    """
    # NOTE(review): this switches off pandas' chained-assignment warning for
    # the whole process on every call. It is kept because existing callers
    # may rely on the warning being silenced; it belongs in the caller.
    pd.options.mode.chained_assignment = None

    for pattern, label in _CRIME_TYPE_PATTERNS:
        if pattern.search(sentence):
            return label
    return "Unknown"

def detect_num_people(text):
    """Extract a head count that directly precedes a crime-related noun.

    Looks for the first number (digits, or a word from 'one' to 'ten') that
    is followed, optionally via "for ", by one of the nouns listed in the
    pattern (suspects, arrests, men, people, charges, ...).

    Args:
        text: Sentence to scan; the match is case-sensitive.

    Returns:
        The count as an int, or None when nothing matches or when a numeric
        match is taken to be an age: above 100, or at least 10 while the
        text contains 'old'.
    """
    pattern = r"\b(\d+|one|two|three|four|five|six|seven|eight|nine|ten)\b(?=\s*(?:for\s)?(?:criminals?|illicit?|robberies?|arrests?|suspects?|offenders?|men|people|individuals|stunt|criminal|charges))"
    word_values = {
        'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
        'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
    }

    found = re.search(pattern, text)
    if not found:
        return None

    token = found.group(1)
    if not token.isdigit():
        # Spelled-out number: translate it through the lookup table.
        return word_values.get(token)

    count = int(token)
    # Large values, or two-digit values next to the word 'old', are assumed
    # to be ages rather than head counts.
    looks_like_age = count > 100 or (count >= 10 and 'old' in text)
    return None if looks_like_age else count
      

# Print the accumulated results and save them as an Excel workbook
def export_excel():
    """Print the global ``crimes_data_df`` and write it to the demo workbook.

    The frame is saved to 'DEMO_crime_analysis_output.xlsx' without its
    index column.
    """
    print(crimes_data_df)
    crimes_data_df.to_excel('DEMO_crime_analysis_output.xlsx', index=False)

In [None]:
# Entry point of the notebook: runs the whole demo pipeline defined above.
initialize_analysis()