## **Importing Basic Libraries**

In [None]:
import pandas as pd
import numpy as np
import spacy
import matplotlib.pyplot as plt
import pickle

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [None]:
# Load the dataset; later cells read its 'Sentences', 'deadline',
# 'deadline_type' and 'contains_a_deadline' columns.
df_train = pd.read_csv('/kaggle/input/deadline-data/data.csv')

In [None]:
# spaCy's small English pipeline, shared by preprocess() and ner_for_deadlines().
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Normalise a sentence into space-separated lowercase lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``.
    Whitespace and punctuation tokens are discarded, and every remaining
    token is replaced by its lowercased lemma.

    Args:
        text: The raw sentence to normalise.

    Returns:
        A single string with the lowercased lemmas joined by one space each
        (an empty string when no token survives the filter).
    """
    lemmas = [
        token.lemma_.lower()
        for token in nlp(text)
        if not token.is_space and not token.is_punct
    ]
    return ' '.join(lemmas)

In [None]:
# Add a normalised copy of every sentence, produced by preprocess().
df_train['preprocessed_sentence']= df_train.Sentences.apply(preprocess)

In [None]:
# Replace missing labels with empty strings so neither target column holds NaN.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selected with [] operates on an intermediate object, which leaves
# df_train unchanged when pandas Copy-on-Write is active.
df_train['deadline'] = df_train['deadline'].fillna("")
df_train['deadline_type'] = df_train['deadline_type'].fillna("")

In [None]:
from spacy.matcher import Matcher
from spacy.pipeline import EntityRuler

# Rule-based matcher built on the vocabulary of the loaded spaCy pipeline.
matcher = Matcher(nlp.vocab)

# Token pattern for numeric dates: a digit token, an optional "/" or "-"
# token, then another digit token. Because the separator is optional, two
# adjacent digit tokens match as well.
date_pattern = [
    {"IS_DIGIT": True},
    {"ORTH": {"in": ["/", "-"]}, "OP": "?"},  # Match hyphen or slash, optional
    {"IS_DIGIT": True},
]

# Register the pattern under the "DATE_PATTERN" key.
matcher.add("DATE_PATTERN", [date_pattern])

def ner_for_deadlines(sentence):
    """Collect the date-like fragments of a sentence into one string.

    Slashes are rewritten as hyphens, then the sentence is parsed with the
    module-level ``nlp`` pipeline. Two sources are gathered from the parsed
    document: entities labelled ``DATE`` and spans found by the module-level
    ``matcher``.

    Args:
        sentence: Text to scan.

    Returns:
        The entity texts followed by the matcher span texts, separated by
        single spaces; an empty string when nothing is found.
    """
    doc = nlp(sentence.replace("/", "-"))

    found = [entity.text for entity in doc.ents if entity.label_ == "DATE"]
    # Matcher results are (match_id, start, end) triples; only the span is used.
    found.extend(doc[begin:stop].text for _, begin, stop in matcher(doc))

    return " ".join(found)

In [None]:
# Extract the date-like fragments of each sentence with ner_for_deadlines().
# NOTE(review): the input is the lowercased, lemmatised text rather than the
# raw 'Sentences' column - confirm that is intended.
df_train['NER_dates']=df_train['preprocessed_sentence'].apply(ner_for_deadlines)

In [None]:
from dateutil import parser
from datetime import datetime

def format_date_to_mm_dd_safe(date_entity):
    """Parse a free-text date fragment and format it as ``MM-DD``.

    Args:
        date_entity: Text that may contain a date, e.g. ``"april 6th"``.

    Returns:
        The date as a zero-padded ``"MM-DD"`` string, or ``None`` when the
        text cannot be parsed or the parsed year falls outside 1900-2100.
    """
    # dateutil takes any field missing from the text from ``default``, which
    # is today's date unless given. Pinning it keeps the result independent
    # of the day the code runs; year 2000 is a leap year, so "feb 29" parses.
    default_date = datetime(2000, 1, 1)
    try:
        parsed_date = parser.parse(date_entity, fuzzy=True, default=default_date)
    except (TypeError, ValueError, OverflowError):
        # ValueError: no usable date in the text; OverflowError: date too
        # large for the platform; TypeError: input is not a string.
        return None

    if 1900 <= parsed_date.year <= 2100:
        return parsed_date.strftime("%m-%d")
    return None

In [None]:
# Normalise the extracted fragments with format_date_to_mm_dd_safe(); rows
# whose 'NER_dates' value is empty are mapped to None without calling it.
df_train['NER_dates_formatted'] = df_train['NER_dates'].apply(lambda x: format_date_to_mm_dd_safe(x) if x else None)

In [None]:
# Show the last 50 rows to inspect the derived columns.
df_train.tail(50)

In [None]:
def balance_plt(df):
    """Plot how many sentences do and do not contain a deadline.

    Args:
        df: DataFrame with a ``contains_a_deadline`` column, expected to
            hold the flags 0 (no deadline) and 1 (deadline).

    Returns:
        None. The bar chart is displayed with ``plt.show()``.
    """
    class_names = {0: 'Does Not Contain', 1: 'Contains'}

    # value_counts() orders by frequency, so sort by class value to keep each
    # bar paired with its own label whichever class is larger.
    deadline_counts = df['contains_a_deadline'].value_counts().sort_index()

    # Derive the labels from the classes actually present, so a frame holding
    # a single class still plots instead of failing on a label-count mismatch.
    tick_labels = [class_names.get(value, str(value)) for value in deadline_counts.index]

    plt.figure(figsize=(8, 6))
    plt.bar(deadline_counts.index, deadline_counts.values, tick_label=tick_labels)
    plt.xlabel('Contains a Deadline')
    plt.ylabel('Count')
    plt.title('Sentences Containing a Deadline vs. Not Containing a Deadline')
    plt.show()

In [None]:
# Partition the rows by class label.
does_not_contain_deadline = df_train.loc[df_train['contains_a_deadline'] == 0]
contains_deadline = df_train.loc[df_train['contains_a_deadline'] == 1]

# Draw rows that contain a deadline, with replacement, until there are as many
# of them as rows without one; the fixed seed keeps the draw repeatable.
# NOTE(review): this resampling runs before the train/test split, so copies of
# one sentence can land on both sides of the split - confirm this is acceptable.
contains_deadline_upsampled = resample(
    contains_deadline,
    n_samples=len(does_not_contain_deadline),
    replace=True,
    random_state=42,
)

# Stack the resampled rows on top of the untouched rows.
balanced_data = pd.concat([contains_deadline_upsampled, does_not_contain_deadline])

In [None]:
# Model input: the normalised sentences of the balanced frame.
X = balanced_data['preprocessed_sentence']
# One target per task, all taken from the same rows as X.
y_contains_a_deadline = balanced_data['contains_a_deadline']
y_deadline = balanced_data['deadline']
y_deadline_type = balanced_data['deadline_type']

In [None]:
# 80/20 split shared by all three targets: the input and every target go
# through one train_test_split call, and the seed fixes the shuffle.
(
    X_train,
    X_test,
    y_contains_a_deadline_train,
    y_contains_a_deadline_test,
    y_deadline_train,
    y_deadline_test,
    y_deadline_type_train,
    y_deadline_type_test,
) = train_test_split(
    X,
    y_contains_a_deadline,
    y_deadline,
    y_deadline_type,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Estimator instances used as the steps of deadline_type_pipeline.
tfidf_vectorizer = TfidfVectorizer(lowercase=True)
# Seed the forest with the same value the other two pipelines use, so that
# repeated runs build the same trees.
rf_classifier = RandomForestClassifier(random_state=42)

In [None]:
# Pipeline for the "contains a deadline" target: TF-IDF features feeding a
# seeded random forest.
contains_a_deadline_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [None]:
# Pipeline for the deadline-value target: TF-IDF features feeding a seeded
# random forest.
deadline_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [None]:
# Pipeline for the deadline-type target. Unlike the other two pipelines it
# reuses the module-level tfidf_vectorizer and rf_classifier instances.
deadline_type_pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('classifier', rf_classifier)
])

## **Training the *TfidfVectorizer* + *RandomForestClassifier* Pipelines**

In [None]:
# Fit each pipeline on the same training sentences, each against its own target.
contains_a_deadline_pipeline.fit(X_train, y_contains_a_deadline_train)
deadline_pipeline.fit(X_train, y_deadline_train)
deadline_type_pipeline.fit(X_train, y_deadline_type_train)

In [None]:
# Predict all three targets for the held-out sentences.
contains_a_deadline_pred = contains_a_deadline_pipeline.predict(X_test)
deadline_pred = deadline_pipeline.predict(X_test)
deadline_type_pred = deadline_type_pipeline.predict(X_test)

In [None]:
# Wrap the deadline-type predictions in a Series and preview the first rows.
deadline_type_pred_test = pd.Series(deadline_type_pred)
deadline_type_pred_test.head()

In [None]:
# Wrap the deadline-value predictions in a Series and preview the first rows.
deadline_pred_test = pd.Series(deadline_pred)
deadline_pred_test.head()

In [None]:
# Wrap the "contains a deadline" predictions in a Series and preview the first rows.
contains_a_deadline_pred_test = pd.Series(contains_a_deadline_pred)
contains_a_deadline_pred_test.head()

In [None]:
# Per-class precision/recall/F1 for the "contains a deadline" target on the test split.
print('Classification Report Contains a Deadline  :\n\n\n ', classification_report(y_contains_a_deadline_test, contains_a_deadline_pred_test))

In [None]:
# Per-class precision/recall/F1 for the deadline-value target on the test split.
print('Classification Report Deadline Date  :\n\n\n ', classification_report(y_deadline_test, deadline_pred_test))

In [None]:
# Per-class precision/recall/F1 for the deadline-type target on the test split.
print('Classification Report Deadline Type  :\n\n\n ', classification_report(y_deadline_type_test, deadline_type_pred_test))

In [None]:
# Persist the fitted deadline-type pipeline, then read it straight back.
with open('deadline_type_model.pkl', 'wb') as model_file:
    pickle.dump(deadline_type_pipeline, model_file)

# The file read here was written by this notebook; unpickling a file from an
# untrusted source would execute arbitrary code.
with open('deadline_type_model.pkl', 'rb') as model_file:
    loaded_deadline_type_model = pickle.load(model_file)

# Also persist that pipeline's fitted TF-IDF step on its own.
with open('tfidf_vectorizer.pickle', 'wb') as vectorizer_file:
    pickle.dump(deadline_type_pipeline.named_steps['tfidf'], vectorizer_file)

with open('tfidf_vectorizer.pickle', 'rb') as file:
    loaded_vectorizer = pickle.load(file)

In [None]:
# Try the reloaded deadline-type model on a sample sentence.
test_sentence = "we have a midterm in december"

# Normalise the sentence with the same function used for the training text.
preprocessed_sentence = preprocess(test_sentence)

# Map the sentence onto the fitted TF-IDF vocabulary.
vectorized_sentence = loaded_vectorizer.transform([preprocessed_sentence])

dense_vectorized_sentence = vectorized_sentence.toarray()

# Recover the vocabulary terms that have a non-zero weight for this sentence.
word_list = loaded_vectorizer.inverse_transform(dense_vectorized_sentence)[0]

# NOTE(review): the model below is a full pipeline with its own TF-IDF step,
# so this vectorise / inverse_transform / re-preprocess round trip looks
# redundant; it also reduces the sentence to its in-vocabulary terms, each
# appearing once - confirm it is intended.
preprocessed_sentence = preprocess(" ".join(word_list))

prediction = loaded_deadline_type_model.predict([preprocessed_sentence])

print(prediction)

In [None]:
# Persist the fitted "contains a deadline" pipeline, then read it straight back.
with open('contains_a_deadline_model.pkl', 'wb') as model_file:
    pickle.dump(contains_a_deadline_pipeline, model_file)

# The file read here was written by this notebook; unpickling a file from an
# untrusted source would execute arbitrary code.
with open('contains_a_deadline_model.pkl', 'rb') as model_file:
    loaded_contains_a_deadline_model = pickle.load(model_file)

In [None]:
# Try the reloaded "contains a deadline" model on a sample sentence.
test_sentence = "we are going to have exam on april 6th"

# Normalise the sentence with the same function used for the training text.
preprocessed_sentence = preprocess(test_sentence)

# NOTE(review): loaded_vectorizer is the TF-IDF step pickled from
# deadline_type_pipeline, not the one inside this model's pipeline - confirm
# that using it here is intended.
vectorized_sentence = loaded_vectorizer.transform([preprocessed_sentence])

dense_vectorized_sentence = vectorized_sentence.toarray()

# Recover the vocabulary terms that have a non-zero weight for this sentence.
word_list = loaded_vectorizer.inverse_transform(dense_vectorized_sentence)[0]

# NOTE(review): the model below vectorises its input itself, so this round
# trip looks redundant and reduces the sentence to its in-vocabulary terms,
# each appearing once - confirm it is intended.
preprocessed_sentence = preprocess(" ".join(word_list))

prediction = loaded_contains_a_deadline_model.predict([preprocessed_sentence])

print(prediction)

In [None]:
# Persist the fitted deadline-value pipeline, then read it straight back.
with open('deadline_model.pkl', 'wb') as model_file:
    pickle.dump(deadline_pipeline, model_file)

# The file read here was written by this notebook; unpickling a file from an
# untrusted source would execute arbitrary code.
with open('deadline_model.pkl', 'rb') as model_file:
    loaded_deadline_model = pickle.load(model_file)

In [None]:
# Try the reloaded deadline-value model on a sample sentence.
test_sentence = "programming due 19/5"

# Normalise the sentence with the same function used for the training text.
preprocessed_sentence = preprocess(test_sentence)

# NOTE(review): loaded_vectorizer is the TF-IDF step pickled from
# deadline_type_pipeline, not the one inside this model's pipeline - confirm
# that using it here is intended.
vectorized_sentence = loaded_vectorizer.transform([preprocessed_sentence])

dense_vectorized_sentence = vectorized_sentence.toarray()

# Recover the vocabulary terms that have a non-zero weight for this sentence.
word_list = loaded_vectorizer.inverse_transform(dense_vectorized_sentence)[0]

# NOTE(review): the model below vectorises its input itself, so this round
# trip looks redundant and reduces the sentence to its in-vocabulary terms,
# each appearing once - confirm it is intended.
preprocessed_sentence = preprocess(" ".join(word_list))

prediction = loaded_deadline_model.predict([preprocessed_sentence])

print(prediction)