In [6]:
import os
import re
import json
# import pyodbc
import pickle
from pprint import pprint
from cleantext import clean
import dateutil.parser as parser 
from datetime import datetime, timedelta, date

In [7]:
PATH_DATA = '../data/'
PATH_DATA_TEXT = os.path.join(PATH_DATA, 'text/')

In [28]:
class Custom_Topic:
    def __init__(self, custom_topic, keyword_score_list, total_score = 100, start_date = None, end_date = None):
        self.custom_topic = {
            'custom_topic': custom_topic.lower(),
            'keyword_score': self.get_keyword_score(keyword_score_list),
            'total_score': total_score,
            'start_date': parser.parse(start_date, dayfirst = True).date() if start_date else date.today(),
            'end_date': parser.parse(end_date, dayfirst = True).date() if end_date else start_date.replace(start_date.year + 1)
        }

    def __getitem__(self, key):
        return self.custom_topic[key]
        
    def get_keyword_score(self, keyword_score_list: list) -> list:
        keyword_score = []
        keyword_score_list_len = len(keyword_score_list)
        if (keyword_score_list_len % 2 != 0):
            print('Missing score for a keyword')
            quit()
        for i in range(0, keyword_score_list_len, 2):
            if ('(' in keyword_score_list[i]):
                with_stop_word = keyword_score_list[i].replace('(', '')
                with_stop_word = with_stop_word.replace(')', '')
                keyword_score.append((with_stop_word.lower(), keyword_score_list[i + 1]))
                without_stop_word = re.sub(r'\(.*\)', '', keyword_score_list[i]).replace('  ', ' ').strip()
                keyword_score.append((without_stop_word.lower(), keyword_score_list[i + 1]))
            else:
                keyword_score.append((keyword_score_list[i].lower(), keyword_score_list[i + 1]))
        return keyword_score

class Custom_Topics:
    def __init__(self, text_data_path = None):
        self.text_data_path = text_data_path if text_data_path else PATH_DATA_TEXT
        # self.db = Database()
        
    def add_new_custom_topic(self, custom_topic: Custom_Topic):
        values = []
        last_custom_id = self.db.write_custom_topic(custom_topic)
        for keyword_score in custom_topic['keyword_score']:
            values.append((*keyword_score, last_custom_id, 
                           datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 
                           datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                        ))
        self.db.write_custom_topic_keyword(values)

    def clean_text(self, text_dict):
        text = text_dict['text']
        text = clean(text, clean_all = False,
						   extra_spaces = True,
						   stemming = False,
						   stopwords = False,
						   lowercase = True,
						   numbers = False,
						   punct = True
					)
        return text

    def find_custom_topic(self, custom_topic, text_file):
        podcast_score = 0
        keyword_match = {}
        text = self.clean_text(pickle.load(open(os.path.join(self.text_data_path, text_file), 'rb')))
        for keyword, score in custom_topic['keyword']:
            count = text.count(keyword)
            keyword_match[keyword] = count
            podcast_score += score * count
        return podcast_score >= custom_topic['total_score'], keyword_match   


In [23]:
custom_topic = {'custom_topic': 'euro footbal 2024',
'keyword': [('euro 2024', 100),
  ('footbal', 10),
  ('match de pool', 25),
  ('match pool', 25),
  ('équipe de france', 15),
  ('équipe france', 15),
  ('les bleus', 15),
  ('bleus', 15),
  ('didier deschamps', 100),
  ('kylian mbappé', 25),
  ('allemagne', 10)],
 'id': 12,
 'total_score': 100}

In [29]:
result, keyword_match = Custom_Topics().find_custom_topic(custom_topic, 'podcast.pkl')

In [30]:
print(result, keyword_match)

True {'euro 2024': 5, 'footbal': 3, 'match de pool': 0, 'match pool': 0, 'équipe de france': 4, 'équipe france': 0, 'les bleus': 3, 'bleus': 4, 'didier deschamps': 2, 'kylian mbappé': 1, 'allemagne': 5}


In [27]:
score

970

In [27]:
topic = 'Euro footbal 2024'
keyword_score_list = ['euro 2024', 100,
                      'footbal', 10,
                      'match (de) pool', 25,
                      'équipe (de) france', 15,
                      '(les) bleus', 15,
                      'Didier Deschamps', 100,
                      'Kylian Mbappé', 25,
                      'Allemagne', 10
                      ]
start_date = '14-12-2023'
end_date = '7-14-2024'
custom_topic = Custom_Topic(topic, keyword_score_list, end_date = end_date)

{'custom_topic': {'custom_topic': 'euro footbal 2024',
                  'end_date': datetime.date(2024, 7, 14),
                  'keyword_score': [('euro 2024', 100),
                                    ('footbal', 10),
                                    ('match de pool', 25),
                                    ('match pool', 25),
                                    ('équipe de france', 15),
                                    ('équipe france', 15),
                                    ('les bleus', 15),
                                    ('bleus', 15),
                                    ('didier deschamps', 100),
                                    ('kylian mbappé', 25),
                                    ('allemagne', 10)],
                  'start_date': datetime.date(2023, 12, 12),
                  'total_score': 100}}


In [5]:
class Database:
    def __init__(self, env = 'prod'):
        with open(os.path.join(PATH_CONFIG, DB_CONFIG)) as file:
            database_info = json.load(file)
        self.conn_common = self._database_conn(database_info[env], 'common')
        self.conn_dmp = self._database_conn(database_info[env], 'dmp')

    def _database_conn(self, database_info, database):
        return 'DRIVER={};\
                    SERVER={};\
                    DATABASE={};\
                    UID={};\
                    PWD={};\
                    TrustServerCertificate=yes'.format(
            database_info['driver'], database_info['server'], 
            database_info['database'][database],
            database_info['username'], database_info['password']
        )

    def get_custom_topic_id(self, custom_topic):
        conn = pyodbc.connect(self.conn_dmp)
        query = """SELECT Id
                   FROM dbo.CustomTopics
                   WHERE CustomTopic = {}
                """.format(custom_topic.lower())
        cursor = conn.cursor()
        cursor.execute(query)
        custom_topic_id = cursor.fetchone()[0]
        cursor.close()
        return custom_topic_id 

    def write_custom_topic(self, custom_topic):
        conn = pyodbc.connect(self.conn_dmp)
        query = """INSERT INTO dbo.CustomTopics
                    (CustomTopic, TotalScore, Active, 
                    StartDate, EndDate, CreatedDate, UpdatedDate)
                   VALUES
                    ('{}', {}, 'True', '{}', '{}', '{}', '{}')
                """.format(
                    custom_topic['custom_topic'],
                    custom_topic['total_score'],
                    custom_topic['start_date'],
                    custom_topic['end_date'],
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'), # CreatedDate
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'), # UpdatedDate
                )
        cursor = conn.cursor()
        cursor.execute(query)
        conn.commit()
        last_custom_id = cursor.lastrowid
        cursor.close()
        return last_custom_id
    
    def write_custom_topic_keyword(self, values):
        conn = pyodbc.connect(self.conn_dmp)
        query = """INSERT INTO dbo.CustomTopicsKeywords
                    (Keyword, Score, CustomTopicId, CreatedDate, UpdatedDate)
                   VALUES
                    (%s, %d, %d, %s, %s)
                """
        cursor = conn.cursor()
        cursor.executemany(query, values)
        conn.commit()
        cursor.close()
