In [None]:
# Importing pre-requisite libraries for the project

import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import re
from itertools import cycle
import traceback
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
import os
import spacy
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import PyPDF2, io
import numpy as np

from dotenv import load_dotenv


# Setting up environment variables
# (send_mail() reads GMAIL_USR and GMAIL_PWD through os.getenv; presumably they
# are supplied by a local .env file picked up here - confirm for your setup)

load_dotenv()

In [None]:
# Initialize the Selenium framework (Chrome browser; the headless flag is currently disabled)

def initialize_chromium():
    """Start a Chrome session and publish it as the module-level ``driver``.

    The browser is launched with info bars and extensions disabled and a
    1920x1080 window. The ``--headless`` flag is not passed, so a visible
    browser window is opened.
    """
    global driver

    chrome_flags = (
        "--disable-infobars",
        "--start-maximized",
        "--disable-extensions",
        '--window-size=1920,1080',
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    driver = webdriver.Chrome(options = chrome_options)


# Release Selenium Framework (Chromium Headless Browser)

def release_chromium():
    """Shut down the module-level Selenium ``driver``.

    ``quit()`` is used instead of ``close()``: ``close()`` only closes the
    current browser window and can leave the browser / chromedriver process
    running, while ``quit()`` ends the whole WebDriver session.
    """
    driver.quit()

In [None]:
# Extracting the Regular Meetings data to a DataFrame

def meetings_extraction():
  """Scrape the board's regular meetings (2022 back to 2011) and export them to Excel.

  For each yearly listing page, the table rows whose type cell reads
  'Regular Meeting' are collected. Meetings whose location is 'CANCELLED' are
  dropped, every remaining meeting page is opened to look for a Chief's verbal
  report attachment, and the resulting table is handed to ``export_to_excel``.

  Uses the module-level Selenium ``driver`` and the module-level
  ``current_year`` (left at 2010 when the function finishes).
  Meetings for which no report link is found keep '' in both
  'Chief Verbal Report Present' and 'Verbal Report File URL'.
  """
  global current_year
  current_year = 2022
  meeting_no = 1
  reg_meets_all = []

  while current_year > 2010:
    url = f'https://ottawapoliceboard.ca/opsb-cspo/{current_year}-meetings.html'
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # A page without the expected table yields no rows instead of an AttributeError.
    table = soup.find('table')
    tbody = table.find('tbody') if table is not None else None
    rows = tbody.find_all('tr') if tbody is not None else []

    # Rows are walked bottom-up, so meeting numbers follow the reversed page order.
    for tr in reversed(rows):
      td = tr.find_all('td')
      # Rows with fewer than four <td> cells cannot carry a meeting type.
      if len(td) < 4 or td[3].get_text().strip() != 'Regular Meeting':
        continue

      date = td[0].get_text().strip()
      if current_year < 2013:
        # The year is appended for pre-2013 pages (presumably their date cells omit it).
        date = f'{date}, {current_year}'

      link = td[0].find('a')
      reg_meets_all.append({
          'Meeting #': meeting_no,
          'Date': date,
          'Location': td[1].get_text().strip(),
          'Time': td[2].get_text().strip(),
          'Meeting Type': td[3].get_text().strip(),
          'Meeting Page': link.get('href') if link is not None else None,
          'Chief Verbal Report Present': '',
          'Verbal Report File URL': ''
        })
      meeting_no = meeting_no + 1
    print(f'Regular Meetings of Year {current_year} processed.')
    current_year = current_year - 1

  # Remove cancelled regular meetings.
  reg_meets = [meet for meet in reg_meets_all if meet['Location'] != 'CANCELLED']

  for meet in reg_meets:
    url = meet['Meeting Page']
    if not url:
      # No meeting page to inspect; the report columns stay blank.
      continue
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    verbal_report_url = _find_verbal_report_url(soup, url)
    if verbal_report_url:
      meet.update({
          'Chief Verbal Report Present': 'Yes',
          'Verbal Report File URL': verbal_report_url
        })

  reg_meets_df = pd.DataFrame(reg_meets)
  export_to_excel(reg_meets_df)


# Spellings of the agenda item label (straight/curly apostrophe, mixed/upper case).
_VERBAL_REPORT_LABELS = ("Chief’s verbal report", "Chief's verbal report", "CHIEF’S VERBAL REPORT", "CHIEF'S VERBAL REPORT")


def _nth_parent(node, levels):
  """Return the ancestor `levels` steps above `node`, or None if the tree ends first."""
  for _ in range(levels):
    if node is None:
      return None
    node = node.find_parent()
  return node


def _filestream_links(node):
  """Return the <a> tags under `node` whose href contains 'filestream' ([] for None)."""
  if node is None:
    return []
  return node.find_all('a', href = lambda href: href and "filestream" in href)


def _find_verbal_report_url(soup, page_url):
  """Return the URL of the Chief's verbal report attachment on a meeting page, or None.

  The URL is built as '<scheme>://<host>/' of `page_url` followed by the link's href.
  """
  domain = '{uri.scheme}://{uri.netloc}/'.format(uri = urlparse(page_url))
  report_url = None
  for el in soup.find_all(text = lambda t: t and any(label in t for label in _VERBAL_REPORT_LABELS)):
    # Search the 3rd ancestor of the label text first and fall back to the 4th.
    links = _filestream_links(_nth_parent(el, 3)) or _filestream_links(_nth_parent(el, 4))
    if links:
      # When several links or labels match, the last one wins.
      report_url = f"{domain}{links[-1]['href']}"
  return report_url


# # Check for Police Chief Verbals from the meetings list
#   for x in reg_meets:
#     url = f"{x['Meeting Page']}"
#     driver.get(url)
#     page_src = driver.page_source
#     soup = BeautifulSoup(page_src, 'html.parser')

    # chief_verbal_attachment = soup.find('table').find('tbody').find_all('tr')

# Export the DataFrame to an Excel Sheet

def export_to_excel(df):
  """Write `df` to the 'Regular Meetings' sheet of 'regular_meetings_v2.3.xlsx'."""
  workbook_path = 'regular_meetings_v2.3.xlsx'
  worksheet_name = 'Regular Meetings'
  df.to_excel(workbook_path, sheet_name = worksheet_name)

In [None]:
initialize_chromium()

reports = []

# Single eScribe agenda page used to try out the verbal-report link lookup.
url = "https://pub-ottawa.escribemeetings.com/Meeting.aspx?Id=83b599bc-563c-6679-4a1e-713cfb4d00fb&Agenda=Agenda&lang=English"
parsed_url = urlparse(url)
domain = f'{parsed_url.scheme}://{parsed_url.netloc}/'
driver.get(url)
page_src = driver.page_source
soup = BeautifulSoup(page_src, 'html.parser')

# Text nodes mentioning the Chief's verbal report (straight or curly apostrophe, either casing).
verbal_report_labels = ["Chief’s verbal report", "Chief's verbal report", "CHIEF’S VERBAL REPORT", "CHIEF'S VERBAL REPORT"]
elems = soup.find_all(text = lambda t: t and any(label in t for label in verbal_report_labels))

for el in elems:
    # The third ancestor of the matching text is the element searched for attachment links.
    container = el.find_parent().find_parent().find_parent()
    print(container)
    reports.extend(
        domain + a['href']
        for a in container.find_all('a', href = lambda href: href and "filestream" in href)
    )
print(reports)

In [None]:
def send_mail():
    """Email the exported regular-meetings workbook to an address typed by the user.

    The sender account and password are read from the ``GMAIL_USR`` and
    ``GMAIL_PWD`` environment variables. The workbook written by
    ``export_to_excel`` ('regular_meetings_v2.3.xlsx') is attached under the
    name 'missing_chief_verbals.xlsx' and sent through smtp.gmail.com:587
    with STARTTLS.

    Raises:
        FileNotFoundError: if the workbook has not been exported yet.

    Failures while talking to the SMTP server are printed, not raised.
    """
    recipient_mail = input("Enter an E-mail address to receive missing chief's verbal list")
    to_mail = recipient_mail.strip()
    gmail_usr = os.getenv('GMAIL_USR')
    gmail_pwd = os.getenv('GMAIL_PWD')
    if not gmail_usr or not gmail_pwd:
        # Without credentials the login below can only fail; stop early with a clear reason.
        print('GMAIL_USR / GMAIL_PWD are not set; email not sent.')
        return

    msg = MIMEMultipart()
    msg['Subject'] = "Missing Chief's Verbals List"
    msg['From'] = gmail_usr
    msg['To'] = to_mail

    msgText = MIMEText("\n This mail consists of regular meetings list where Chief's Verbal document is missing \n p.f.a. \n", 'html')
    msg.attach(msgText)

    # Same file name that export_to_excel() writes, so the current export is attached.
    filename = "regular_meetings_v2.3.xlsx"
    with open(filename, 'rb') as workbook:
        xlsx = MIMEApplication(workbook.read())
    xlsx.add_header('Content-Disposition', 'attachment', filename = 'missing_chief_verbals.xlsx')
    msg.attach(xlsx)

    try:
        # The context manager sends QUIT and closes the connection on exit.
        with smtplib.SMTP("smtp.gmail.com", 587) as smtpserver:
            smtpserver.ehlo()
            smtpserver.starttls()
            smtpserver.login(gmail_usr, gmail_pwd)
            # The envelope sender must be a plain address; the display form lives in msg['From'].
            smtpserver.sendmail(gmail_usr, to_mail, msg.as_string())
    except Exception as e:
        print(e)
    else:
        # Only report success when nothing went wrong above.
        print ('email sent!')

In [None]:
def initialize_analysis():
    """Download each verbal-report PDF listed in the meetings workbook and analyse it.

    Reads 'regular_meetings_v2.3.xlsx', keeps the rows that have both a 'Date'
    and an http(s) 'Verbal Report File URL', extracts the text of every PDF,
    runs ``sentiment_analyzer`` on it and finally calls ``export_excel``.

    Module-level names written: ``crimes_data_df`` (all per-report results
    concatenated) and ``corpus`` (text of the last PDF processed).
    Reports that cannot be downloaded are reported and skipped.
    """
    global crimes_data_df, corpus
    crimes_data_df = pd.DataFrame()

    # Read the Excel file
    excel_file = 'regular_meetings_v2.3.xlsx'
    reports_df = pd.read_excel(excel_file)

    # Only the two columns used below decide whether a row is usable; blanks in
    # unrelated columns (e.g. Time) must not discard a meeting that has a report.
    url_col = 'Verbal Report File URL'
    pdf_urls = reports_df.dropna(subset = ['Date', url_col])[['Date', url_col]]
    # Placeholder values such as '-' are not downloadable links.
    pdf_urls = pdf_urls[pdf_urls[url_col].astype(str).str.startswith('http')]

    print(pdf_urls)

    frames = []
    for index, row in pdf_urls.iterrows():
        report_url = row[url_col]
        print("link started: ", index, report_url)
        try:
            resp = requests.get(report_url, timeout = 60)
            resp.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable document should not abort the whole run.
            print("link skipped: ", index, report_url, exc)
            continue

        with io.BytesIO(resp.content) as file:
            pdf = PyPDF2.PdfReader(file)
            # Concatenate the text of every page; pages without text contribute ''.
            corpus = "".join(page.extract_text() or "" for page in pdf.pages)

        sentiment_analyzer(corpus, row['Date'])
        # sentiment_analyzer publishes its result in the module-level ``crime_df``.
        frames.append(crime_df)
        print("link over: ", index, report_url)

    # DataFrame.append no longer exists in pandas 2.x; concatenate once at the end.
    if frames:
        crimes_data_df = pd.concat(frames)

    export_excel()
    print("Crime Analysis Completed! Please check the output file: 'crime_sentiment_analysis.xlsx' for the output.")
 
def sentiment_analyzer(corpus, date):
    """Build the table of crime-related sentences found in one report.

    The text is lower-cased and split into sentences. Sentences whose VADER
    'neg' score exceeds their 'pos' score are kept, then filtered with
    ``contains_crime`` and annotated with ``detect_num_people`` (missing
    counts default to 1) and ``detect_crime_type``.

    Args:
        corpus: full text of the report.
        date: value stored in the 'Date' column of every resulting row.

    Returns:
        The resulting DataFrame, which is also stored in the module-level
        ``crime_df`` for callers that read it from there.
    """
    global crime_df

    text = corpus.replace("\n", " ").lower()
    # sent_tokenize loads the English Punkt model itself, avoiding a hard-coded resource path.
    sentences = nltk.sent_tokenize(text)

    analyzer = SentimentIntensityAnalyzer()
    negative_sentences = []

    # A sentence counts as a candidate when it is more negative than positive.
    for sentence in sentences:
        sentiment_score = analyzer.polarity_scores(sentence)
        if sentiment_score['neg'] > sentiment_score['pos']:
            negative_sentences.append(sentence)

    candidates = pd.DataFrame({'sentences': negative_sentences})
    candidates.insert(0, 'Date', date)
    candidates['contains_crime'] = candidates['sentences'].apply(contains_crime)

    # .copy() makes the filtered rows an independent frame, so the column
    # assignments below do not write into a slice of ``candidates``.
    crime_df = candidates[candidates['contains_crime'] == True].copy()
    print(crime_df)

    # to_numeric turns "no count found" (None) into NaN before defaulting to 1.
    people_counts = pd.to_numeric(crime_df["sentences"].apply(detect_num_people), errors = 'coerce')
    crime_df["num_people_involved"] = people_counts.fillna(1).astype(int)
    crime_df["crime_type"] = crime_df["sentences"].apply(detect_crime_type)

    return crime_df


# function to detect if a sentence contains a crime
def contains_crime(sentence):
    """Return True when `sentence` looks crime-related, otherwise False.

    Three checks are applied in order:
      1. any named entity whose label is in ``_CRIME_ENTITY_LABELS``;
      2. any verb whose lemma is one of ``_CRIME_WORDS``;
      3. any token whose similarity to one of the crime words exceeds 0.6.
    """
    nlp, crime_docs = _load_crime_nlp()
    doc = nlp(sentence)

    # check the entity labels
    if any(ent.label_ in _CRIME_ENTITY_LABELS for ent in doc.ents):
        return True

    # check for verbs that are themselves crime words
    if any(token.pos_ == 'VERB' and token.lemma_ in _CRIME_WORDS for token in doc):
        return True

    # Compare EVERY token with every crime word; only after all tokens have
    # been examined is the sentence declared not crime-related.
    for token in doc:
        for crime_doc in crime_docs:
            if token.similarity(crime_doc) > 0.6:
                return True
    return False


_CRIME_WORDS = ['homicide', 'murder', 'kill', 'sexual', 'assault', 'drug', 'shotgun', 'rob', 'criminal', 'charge', 'rape', 'violence', 'attack', 'sexual assault', 'robbery', 'shoot', 'gun']
_CRIME_ENTITY_LABELS = ['CRIME', 'LAW', 'MURDER', 'PERSON', 'WEAPON', 'MONEY', 'GUN', 'CRIMINAL CHARGES', 'NUMBERS']

# Filled on first use so the model is loaded once, not once per sentence.
_crime_nlp_cache = {}


def _load_crime_nlp():
    """Return (nlp, parsed crime words), loading the spaCy model on the first call only."""
    if 'nlp' not in _crime_nlp_cache:
        nlp = spacy.load("en_core_web_lg")
        _crime_nlp_cache['crime_docs'] = [nlp(word) for word in _CRIME_WORDS]
        _crime_nlp_cache['nlp'] = nlp
    return _crime_nlp_cache['nlp'], _crime_nlp_cache['crime_docs']


# Define a function to detect the type of crime in a sentence
def detect_crime_type(sentence):
    """Return the label of the first crime category whose keywords occur in `sentence`.

    Categories are tried in the order listed below and matched
    case-insensitively as plain substrings (no word boundaries, so e.g. 'car'
    also matches inside 'care'). "Unknown" is returned when nothing matches.

    Side effect: sets pandas' ``mode.chained_assignment`` option to None.
    """
    # NOTE(review): process-wide pandas option changed from inside a classifier;
    # looks like it is meant to silence SettingWithCopyWarning elsewhere - confirm.
    pd.options.mode.chained_assignment = None

    # (keyword pattern, label) pairs; earlier entries take precedence.
    # NOTE(review): 'possesion' is misspelled, so the correctly spelled word
    # 'possession' does not match the stolen-property rule - confirm intent.
    rules = (
        (r"(murder|killing|death|homicide|manslaughter|Guns|shooting|died|offenders|firearm|shots|fired|shoot)", "homicide"),
        (r"(stabbing|stabbed|knife attack|knife)", "stabbing"),
        (r"(assaulted the officer)", "total assaults against a peace officer"),
        (r"(Theft|stolen|fraud|possesion)", "total possession of stolen property"),
        (r"(drug|marijuana|substances|cannabis|products|narcotics|overdosing|overdosed)", "drug violations"),
        (r"(stunt|driving|licence|demerit|fined|car)", "driving violations"),
        (r"(hateful|hate|speech)", "speech violations"),
        (r"(sexual|sexual assault|harassment|harassing|abusing|abuse|threatening|fighting|rape)", "assault and harrassment"),
    )

    for keywords, label in rules:
        if re.search(keywords, sentence, re.IGNORECASE):
            return label
    return "Unknown"

def detect_num_people(text):
    """Extract how many people a sentence mentions, or None when no count is found.

    The first number (digits, or the lower-case words 'one'..'ten') that is
    directly followed by a people/offence keyword is used. Digit values above
    100, and values of 10 or more when the text contains 'old', are treated
    as ages and yield None.
    """
    number_words = {
        'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
        'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
    }
    pattern = r"\b(\d+|one|two|three|four|five|six|seven|eight|nine|ten)\b(?=\s*(?:for\s)?(?:criminals?|illicit?|robberies?|arrests?|suspects?|offenders?|men|people|individuals|stunt|criminal|charges))"

    found = re.search(pattern, text)
    if not found:
        return None

    token = found.group(1)
    if not token.isdigit():
        # Spelled-out number.
        return number_words.get(token)

    count = int(token)
    # Large values, or two-digit values next to the word 'old', are assumed to be ages.
    looks_like_age = count > 100 or (count >= 10 and 'old' in text)
    return None if looks_like_age else count
      

# Print the combined crime DataFrame and export it to an Excel file
def export_excel():
    """Print the module-level ``crimes_data_df`` and save it to 'crime_sentiment_analysis.xlsx'.

    The frame's index is not written to the workbook.
    """
    results = crimes_data_df
    print(results)
    results.to_excel('crime_sentiment_analysis.xlsx', index = False)

In [None]:
# Pipeline entry points. The scraping and mailing calls are commented out, so
# running this cell only executes the analysis step, which reads the workbook
# 'regular_meetings_v2.3.xlsx' (presumably produced by an earlier scraping run).
# initialize_chromium()
# meetings_extraction()
# release_chromium()
# send_mail()
initialize_analysis()