# SVD 50 + preprocessing (applied to queries too) + similarity 0.12

In [91]:
import csv
from datetime import datetime

########################################################### Read CSV files
def read_csv_to_dict(file_paths, keys):
    """Load article lists from CSV files, one list per key.

    ``file_paths`` and ``keys`` are paired positionally with ``zip``, so
    surplus entries in the longer of the two are ignored.

    Args:
        file_paths: Paths of UTF-8 CSV files whose header row contains a
            ``Title`` column and, optionally, a ``Date`` column formatted
            as ``%Y/%m/%d %H:%M``.
        keys: The dictionary key under which each file's articles are
            stored.

    Returns:
        A dict mapping each key to a list of ``{'title', 'date'}`` dicts.
        ``date`` is a ``%Y/%m/%d`` string, or ``None`` when the row has no
        ``Date`` column or the cell is empty.

    Raises:
        KeyError: If a row has no ``Title`` column.
        ValueError: If a non-empty ``Date`` cell does not match the
            expected format.
    """
    data_dict = {}

    for file_path, key in zip(file_paths, keys):
        articles = []
        # newline='' leaves line-ending handling to the csv module, so
        # quoted fields containing line breaks are parsed correctly.
        with open(file_path, mode='r', encoding='utf-8', newline='') as file:
            for row in csv.DictReader(file):
                # Look the column up by name: counting columns would
                # misfire on files with extra columns but no 'Date'.
                raw_date = (row.get('Date') or '').strip()
                if raw_date:
                    parsed = datetime.strptime(raw_date, "%Y/%m/%d %H:%M")
                    # Keep only the calendar date; the time is dropped.
                    date_only = parsed.strftime("%Y/%m/%d")
                else:
                    date_only = None
                articles.append({'title': row['Title'], 'date': date_only})

        data_dict[key] = articles

    return data_dict

########################################################### precision/recall/f1
def calculate_precision_recall(ground_truth, model_results):
    """Compute title-based precision and recall for one query.

    Articles are matched by their ``'title'`` value. Both inputs are
    reduced to sets of unique titles before counting, so the numerator
    (unique matching titles) and the denominators are measured on the same
    basis and a title listed twice is not counted twice.

    Args:
        ground_truth: Iterable of dicts with a ``'title'`` key; the
            relevant articles.
        model_results: Iterable of dicts with a ``'title'`` key; the
            articles the model returned.

    Returns:
        A ``(precision, recall)`` tuple. Each value is 0.0 when its
        denominator would be empty.
    """
    relevant_titles = {article['title'] for article in ground_truth}
    retrieved_titles = {article['title'] for article in model_results}
    hits = len(relevant_titles & retrieved_titles)

    precision = hits / len(retrieved_titles) if retrieved_titles else 0.0
    recall = hits / len(relevant_titles) if relevant_titles else 0.0
    return precision, recall

def calculate_f1_score(precision, recall):
    """Return the harmonic mean of precision and recall.

    Returns 0 when ``precision + recall`` is zero, which avoids dividing
    by zero.
    """
    denominator = precision + recall
    if denominator != 0:
        return 2 * (precision * recall) / denominator
    return 0

def _mean_of_column(rows, index):
    """Average element ``index`` of every tuple in ``rows`` (0.0 if empty)."""
    if not rows:
        return 0.0
    return sum(row[index] for row in rows) / len(rows)


def _score_and_report(model_name, key, ground_truth, retrieved):
    """Score one model on one query, print the report, return (P, R, F1)."""
    precision, recall = calculate_precision_recall(ground_truth, retrieved)
    f1_score = calculate_f1_score(precision, recall)
    print(f"==================== result of {model_name} model [ '{key}' ] ====================")
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}")
    print(f"F1 Score: {f1_score:.2f}\n")
    return precision, recall, f1_score


def _average_and_report(model_name, rows, final_suffix=""):
    """Print and return the average (P, R, F1) over per-query ``rows``."""
    averages = tuple(_mean_of_column(rows, index) for index in range(3))
    avg_precision, avg_recall, avg_f1 = averages
    print(f"Average Precision of {model_name} model: {avg_precision:.2f}")
    print(f"Average Recall of {model_name} model: {avg_recall:.2f}")
    print(f"Average F1 Score of {model_name} model: {avg_f1:.2f}{final_suffix}")
    return averages


def evaluate_1(ground_truths, boolean_results, vector_results):
    """Evaluate the boolean and vector models against the ground truth.

    For every key of ``ground_truths`` the precision, recall and F1 of
    both models are computed and printed, followed by the per-model
    averages over all keys.

    Args:
        ground_truths: Dict mapping a query key to its relevant articles.
        boolean_results: Dict mapping the same keys to the boolean model's
            retrieved articles.
        vector_results: Dict mapping the same keys to the vector model's
            retrieved articles.

    Returns:
        A 3-tuple ``(boolean_averages, vector_averages, per_query)``. The
        two averages are ``(precision, recall, f1)`` tuples, and
        ``per_query`` is ``(results_b, results_v)``: two lists of
        per-key ``(precision, recall, f1)`` tuples in ``ground_truths``
        order. With no queries at all the averages are 0.0 rather than a
        ZeroDivisionError.

    Raises:
        KeyError: If a key of ``ground_truths`` is missing from
            ``boolean_results`` or ``vector_results``.
    """
    results_b = []
    results_v = []

    for key, ground_truth in ground_truths.items():
        # Boolean first, then vector, so the printed reports stay
        # interleaved per query.
        results_b.append(
            _score_and_report("boolean", key, ground_truth, boolean_results[key])
        )
        results_v.append(
            _score_and_report("vector", key, ground_truth, vector_results[key])
        )

    # The boolean summary ends with a blank line to separate it from the
    # vector summary that follows.
    averages_b = _average_and_report("boolean", results_b, final_suffix="\n")
    averages_v = _average_and_report("vector", results_v)

    return averages_b, averages_v, (results_b, results_v)

In [92]:
# Ground-truth label files, one per query (same order as `keys` below).
label_data = [
    'datasets/총선.csv',
    'datasets/선거.csv',
    'datasets/제22대_국회의원선거.csv',
    'datasets/4월_10일_투표.csv',
]
# Boolean-model result files, in the same query order.
boolean_data = [
    'datasets/Boolean_총선.csv',
    'datasets/Boolean_선거.csv',
    'datasets/Boolean_제22대.csv',
    'datasets/Boolean_410투표.csv',
]
# Vector-model result files, in the same query order.
vector_data = [
    'datasets/Vector_총선.csv',
    'datasets/Vector_선거.csv',
    'datasets/Vector_제22대.csv',
    'datasets/Vector_410투표.csv',
]

# Each key is paired positionally with the file at the same index above.
keys = [
    '총선',
    '선거',
    '제22대_국회의원선거',
    '4월_10일_투표',
]

boolean_results = read_csv_to_dict(file_paths=boolean_data, keys=keys)
vector_results = read_csv_to_dict(file_paths=vector_data, keys=keys)
ground_truths = read_csv_to_dict(file_paths=label_data, keys=keys)

In [93]:
# Only the per-query scores are kept; the two average tuples are discarded.
_, _, results = evaluate_1(
    ground_truths=ground_truths,
    boolean_results=boolean_results,
    vector_results=vector_results,
)

Precision: 0.96, Recall: 0.62
F1 Score: 0.75

Precision: 0.72, Recall: 0.76
F1 Score: 0.74

Precision: 0.93, Recall: 0.33
F1 Score: 0.49

Precision: 0.82, Recall: 0.57
F1 Score: 0.67

Precision: 0.47, Recall: 0.22
F1 Score: 0.30

Precision: 0.09, Recall: 0.91
F1 Score: 0.16

Precision: 1.00, Recall: 0.05
F1 Score: 0.09

Precision: 0.04, Recall: 0.67
F1 Score: 0.08

Average Precision of boolean model: 0.84
Average Recall of boolean model: 0.30
Average F1 Score of boolean model: 0.41

Average Precision of vector model: 0.42
Average Recall of vector model: 0.72
Average F1 Score of vector model: 0.41


In [94]:
class TimeRelevanceScorer:
    """Measure what share of retrieved articles were published recently.

    An article counts as recent when it was published at most ``recent``
    days before the reference date.

    Args:
        vector_results: Dict mapping a query key to a list of article
            dicts, each with a ``'date'`` entry that is a ``%Y/%m/%d``
            string or a falsy value when unknown.
        recent: Maximum age in days for an article to count as recent.
        articles: Stored on the instance; not read by any method here.
        reference_date: ``datetime`` the article age is measured from.
            Defaults to 2024-04-10.
    """

    DEFAULT_REFERENCE_DATE = datetime(2024, 4, 10)

    def __init__(self, vector_results, recent=10, articles=30, reference_date=None):
        self.vector_results = vector_results
        self.recent = recent
        self.articles = articles
        self.reference_date = (
            reference_date if reference_date is not None
            else self.DEFAULT_REFERENCE_DATE
        )
        # Kept under a private name: storing the scores in an attribute
        # called `time_relevance_scores` would shadow the method of the
        # same name and make a second call fail.
        self._scores_by_key = {}

    def calculate_time_relevance(self, publication_date):
        """Return 1 if ``publication_date`` is recent, otherwise 0.

        Args:
            publication_date: Date string formatted as ``%Y/%m/%d``.

        Raises:
            ValueError: If the string does not match the expected format.
        """
        publication_datetime = datetime.strptime(publication_date, '%Y/%m/%d')
        days_since_publication = (self.reference_date - publication_datetime).days

        # NOTE: dates after the reference date give a negative age and so
        # also count as recent.
        return 1 if days_since_publication <= int(self.recent) else 0

    def time_relevance_scores(self, keys):
        """Compute the ratio of recent articles for each requested key.

        Keys absent from ``vector_results`` are skipped, and articles
        without a date are left out of the ratio. A key with no dated
        articles scores 0. Each call replaces the scores stored by the
        previous call.

        Returns:
            Dict mapping each processed key to its recent-article ratio.
        """
        scores = {}

        for key in keys:
            if key not in self.vector_results:
                continue

            recency_flags = [
                self.calculate_time_relevance(article['date'])
                for article in self.vector_results[key]
                if article['date']  # skip articles with no date
            ]
            scores[key] = (
                sum(recency_flags) / len(recency_flags) if recency_flags else 0
            )

        self._scores_by_key = scores
        return scores

    def calculate_overall_average_score(self):
        """Return the mean of the last computed per-key scores.

        Returns 0 when no scores have been computed yet.
        """
        all_scores = list(self._scores_by_key.values())
        if not all_scores:
            return 0
        return sum(all_scores) / len(all_scores)

In [99]:
keys = [
    '총선',
    '선거',
    '제22대_국회의원선거',
    '4월_10일_투표',
]
scorer = TimeRelevanceScorer(vector_results=vector_results)
average_time_relevance_scores = scorer.time_relevance_scores(keys=keys)

# One line per query: the share of its vector-model articles that count as recent.
for key in average_time_relevance_scores:
    average_score = average_time_relevance_scores[key]
    print(f"'{key}'\'s time relevance score : {average_score:.4f}")

# Mean of the per-query ratios printed above.
overall_average_score = scorer.calculate_overall_average_score()
print(f"\noverall average time relevance score: {overall_average_score:.4f}")

'총선''s time relevance score : 0.1523
'선거''s time relevance score : 0.2611
'제22대_국회의원선거''s time relevance score : 0.2461
'4월_10일_투표''s time relevance score : 0.5809

overall average time relevance score: 0.3101


In [102]:
def calculate_combined_metric(f1_score, time_relevance_score, alpha=0.7, beta=0.3):
    """Blend an F1 score and a time relevance score into one weighted sum.

    Args:
        f1_score: F1 score of the query.
        time_relevance_score: Time relevance score of the query.
        alpha: Weight applied to ``f1_score``.
        beta: Weight applied to ``time_relevance_score``.

    Returns:
        ``alpha * f1_score + beta * time_relevance_score``.
    """
    weighted_f1 = alpha * f1_score
    weighted_recency = beta * time_relevance_score
    return weighted_f1 + weighted_recency

# results[1] holds the vector model's per-query (precision, recall, f1)
# tuples, so the last element of each tuple is that query's F1.
f1 = [scores[-1] for scores in results[1]]
f1_score = f1
combined_results = []

# The per-query scores were appended in ground_truths order, so index i
# lines up with the i-th key.
for i, key in enumerate(ground_truths):
    combined_metric = calculate_combined_metric(
        f1[i],
        average_time_relevance_scores[key],
        alpha=0.9,
        beta=0.1,
    )
    print(f"Combined Metric for '{key}': {combined_metric:.4f}")
    combined_results.append(combined_metric)

average_combined = sum(combined_results) / len(combined_results)
print(f'\nOverall average combined score: {average_combined:.2f}')

Combined Metric for '총선': 0.6801
Combined Metric for '선거': 0.6276
Combined Metric for '제22대_국회의원선거': 0.1685
Combined Metric for '4월_10일_투표': 0.1268

Overall average combined score: 0.40
