In [197]:
# !pip install transformers==4.8.2
# !python setup.py install
# !pip install --upgrade --force-reinstall scikit-learn numpy
# !pip install --upgrade --force-reinstall numpy



# Table of content
----
## English Learning Support
- [1. CEFR English Level Predictor](#1)
  - [1.1. RoBERTa for CEFR English Level Predictor](#1.1)
  - [1.2. XGboost for CEFR English Level Predictor](#1.2)
- [2. Grammar Error Correction](#2)
- [3. Tense Predictions](#3)
- [4. Spelling Check](#4)
- [5. Identify Error Types](#5)
- [6. Structure Prediction Constituency Parser](#6)

## Question Generation and Fact-Check
- [1. T5 for Summarize Task](#7)
- [2. Question Generation](#8)
- [3. Fact-Check](#9)

## Recommendation System
- [1. Association Rule - Apriori](#10)
- [2. Similar Courses](#11)


# English learning support

----
<a id="1"></a>

### 1. CEFR English Level Predictor


<a id="1.1"></a>

#### 1.1. RoBERTa for CEFR English Level Predictor


Get [CEFR_model](https://drive.google.com/drive/folders/1VkdH3IPyoA8KAmpt9YbVgP399KPQvZFV?usp=sharing).

In [3]:
import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Load the pre-trained model and tokenizer

ro_model = RobertaForSequenceClassification.from_pretrained(
    "model/cefr/cefr999_model",
    num_labels=6
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ro_model.to(device)
ro_tokenizer = AutoTokenizer.from_pretrained("model/cefr/cefr999_token")

def predict_english_level(text):
    # Encode the text using the tokenizer
    inputs = ro_tokenizer(
        text,
        padding="max_length",
        max_length=512,
        truncation=True,
        return_tensors="pt"
    )
    input_ids = inputs["input_ids"].squeeze()
    attention_mask = inputs["attention_mask"].squeeze()

    # Make a prediction
    with torch.no_grad():
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        output = ro_model(input_ids=input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))
        predicted_class = output.logits.argmax().item()

    # Map the predicted class back to the English level
    labels = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
    predicted_level = labels[predicted_class]

    return predicted_level

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
text = """

In contemporary era thattourism had been show great  impact on society . I agree this statement and my view here

Firsty , tourism has a great impact on everycountry economy beacuse where foriegners have come to see their tourist place they spend money to buy many things . Moreover , some tourism place like a jaipur there is many historical builiding and every year lot of people visit these place to see there 

Building and when they saw these building buy some tickets . These money spend on building by government .

Moreover if they another countries individuals come their country than they expand their culture one place to another place and it is big way exploure our culture .

"""
# print(len(text))
predicted_level = predict_english_level(text)
print("Predicted English Level:", predicted_level)

Predicted English Level: B1


<a id="1.2"></a>

#### 1.2. XGboost for CEFR English Level Predictor

In [9]:
import textstat
import pandas as pd
# from xgboost import XGBClassifier

import nltk
nltk.download('punkt')

import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize

import joblib

model_filename = "model/cefr/xgboost100.pkl"

xgb_model = joblib.load(model_filename)

def avg_words_per_sentence(text):
    sentences = sent_tokenize(text)
    word_count = sum(len(word_tokenize(sentence)) for sentence in sentences)
    if len(sentences) == 0:
        return 0
    return word_count / len(sentences)

# Hàm để dự đoán trình độ tiếng Anh dựa trên văn bản đầu vào
def predict_english_level_XG(text, xgb_model):

    smog_index = textstat.smog_index(text)
    automated_readability_index = textstat.automated_readability_index(text)
    dale_chall_readability_score = textstat.dale_chall_readability_score(text)
    difficult_words = textstat.difficult_words(text)
    linsear_write_formula = textstat.linsear_write_formula(text)
    gunning_fog = textstat.gunning_fog(text)
    szigriszt_pazos = textstat.szigriszt_pazos(text)
    gutierrez_polini = textstat.gutierrez_polini(text)
    crawford = textstat.crawford(text)
    osman = textstat.osman(text)
    avg = avg_words_per_sentence(text)

    data = pd.DataFrame({
        "smog_index": [smog_index],
        "automated_readability_index": [automated_readability_index],
        "dale_chall_readability_score": [dale_chall_readability_score],
        "difficult_words": [difficult_words],
        "linsear_write_formula": [linsear_write_formula],
        "gunning_fog": [gunning_fog],
        "szigriszt_pazos": [szigriszt_pazos],
        "gutierrez_polini": [gutierrez_polini],
        "crawford": [crawford],
        "osman": [osman],
        "avg_words": [avg]
    })

    predicted_level = xgb_model.predict(data)
    predicted_probabilities = xgb_model.predict_proba(data)
    # Chuyển kết quả từ số sang trình độ tiếng Anh tương ứng
    level_to_index = {0: "A1", 1: "A2", 2: "B1", 3: "B2", 4: "C1", 5: "C2"}
    predicted_level_text = level_to_index[predicted_level[0]]

    return predicted_level_text, predicted_probabilities


[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.



In [11]:
text = """
In contemporary era thattourism had been show great  impact on society . I agree this statement and my view here

Firsty , tourism has great impact on everycountry economy beacuse where foriegners are come to see their tourist place they spend money to buy many things . Moreover , some tourism place like a jaipur there is many historical builiding and every year lot of people visit these place to see there 

Building and when they saw these building buy some tickets . These money spend on building by government .

Moreover if they another countries individuals come their country than they expand their culture one place to another place and it is big way exploure our culture .

"""
predicted_level = predict_english_level(text)
print("Predicted English Level - Neural:", predicted_level)
predicted_level,prob = predict_english_level_XG(text, xgb_model)
print("Predicted English Level - XG:", predicted_level)

Predicted English Level - Neural: B1
Predicted English Level - XG: B2


----
<a id="2"></a>

### 2. Grammar Error Corection

In [202]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
# # model_name = 'deep-learning-analytics/GrammarCorrector'
model_name = 'model/gec_model/gec_03'
# model_name = 't5_gec_model_02' # model path
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
g_tokenizer = T5Tokenizer.from_pretrained(model_name)
g_model = T5ForConditionalGeneration.from_pretrained(model_name).to(torch_device)

In [203]:
def correct_grammar(input_text,num_return_sequences):
  batch = g_tokenizer([input_text],truncation=True,padding='max_length',max_length=128, return_tensors="pt").to(torch_device)
  translated = g_model.generate(**batch,max_length=128,num_beams=3, num_return_sequences=num_return_sequences, temperature=1.5)
  tgt_text = g_tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

In [204]:
input_text = """
Dear Sir/Madam,

This letter aims to share my opinions about the booked that influenced me the most. The name of the book is "You Can Win" and I found this book very inspiring and knowledgeable. Basically, the book includes real-life examples and motivational language to give a ray of hope to a disappointed person.

I started reading the book when I faced a failure in life and it seemed like the end of life. I read the book recommended to me by one of my friends and when I started reading the book, I got my confidence, motivation and self-esteem back and understood the real meaning of life.

I are highly recommend this book to everyone to achieve better in life and gain an in-depth understanding of life. Moreover, the book is written in such a good way and language which makes it interesting and inspiring when one feels low and demotivated in life. Though I am not a bibliophile, I found this book really very interesting and motivational.

Yours Faithfully,

ABC
"""
num_return_sequences = 3
corrected_texts = correct_grammar(input_text, num_return_sequences)

In [205]:
input_text = """
I agree this statement and my view here.
"""
num_return_sequences = 3
corrected_texts = correct_grammar(input_text, num_return_sequences)

In [206]:
corrected_texts

['I agree with this statement and my view here.',
 'I agree with this statement and my views here.',
 'I agree with this statement and my opinion here.']

In [207]:
import re

def correct_and_merge(input_text, num_return_sequences):
    sentences = re.split(r'[.!?\n]', input_text)
    corrected_sentences = []

    for sentence in sentences:
        words = sentence.split()
        if len(words) > 3:
            corrected_versions = correct_grammar(sentence, num_return_sequences=num_return_sequences)
            corrected_sentences.extend(corrected_versions)
        else:
            corrected_sentences.append(sentence)
    corrected_texts = []
    for i in range(num_return_sequences):

        corrected_texts.append('\n'.join(corrected_sentences[i::num_return_sequences]))

    return corrected_texts

input_text = """
In contemporary era thattourism had been show great  impact on society . I agree this statement and my view here
Firsty , tourism has great impact on everycountry economy beacuse where foriegners are come to see their tourist place they spend money to buy many things . Moreover , some tourism place like a jaipur there is many historical builiding and every year lot of people visit these place to see there 
Building and when they saw these building buy some tickets . These money spend on building by government .
Moreover if they another countries individuals come their country than they expand their culture one place to another place and it is big way exploure our culture .
"""
num_return_sequences = 2
corrected_texts = correct_and_merge(input_text, num_return_sequences)
for i, corrected_text in enumerate(corrected_texts):
    print(f"Corrected Text {i + 1}:\n{corrected_text}")

Corrected Text 1:

In the modern era tourism has had a great impact on society.
I agree with this statement and my views here.
First of all, tourism has great impact on every country economy beacuse where tourists come to see their tourist place they spend money to buy many things.
Moreover, some tourism places like Jaipur there are many historical buildings and every year lot of people visit these places to see there.
Building and when they saw these building they bought some tickets.
This money is spent on building by government.
Moreover if individuals come from another country then they expand their culture one place to another place and it is a big way to expand our culture.

Corrected Text 2:
In the modern era, tourism has had a great impact on society.
I agree with this statement and my view here.
First of all, tourism has great impact on every country economy beacuse where foreigners come to see their tourist place they spend money to buy many things.
Moreover, some tourism pla

In [208]:
input_text = """
In contemporary era thattourism had been show great  impact on society . I agree this statement and my view here
Firsty , tourism has great impact on everycountry economy beacuse where foriegners are come to see their tourist place they spend money to buy many things . Moreover , some tourism place like a jaipur there is many historical builiding and every year lot of people visit these place to see there 
Building and when they saw these building buy some tickets . These money spend on building by government .
Moreover if they another countries individuals come their country than they expand their culture one place to another place and it is big way exploure our culture .
"""
num_return_sequences = 2
corrected_text = correct_and_merge(input_text, num_return_sequences)
print(input_text)
corrected_text
# https://writing9.com/text/652464bcd74fc1001274c3d0-nodaway-tourism-has-show-great-impact-on-society-do


In contemporary era thattourism had been show great  impact on society . I agree this statement and my view here
Firsty , tourism has great impact on everycountry economy beacuse where foriegners are come to see their tourist place they spend money to buy many things . Moreover , some tourism place like a jaipur there is many historical builiding and every year lot of people visit these place to see there 
Building and when they saw these building buy some tickets . These money spend on building by government .
Moreover if they another countries individuals come their country than they expand their culture one place to another place and it is big way exploure our culture .



['\nIn the modern era tourism has had a great impact on society.\nI agree with this statement and my views here.\nFirst of all, tourism has great impact on every country economy beacuse where tourists come to see their tourist place they spend money to buy many things.\nMoreover, some tourism places like Jaipur there are many historical buildings and every year lot of people visit these places to see there.\nBuilding and when they saw these building they bought some tickets.\nThis money is spent on building by government.\nMoreover if individuals come from another country then they expand their culture one place to another place and it is a big way to expand our culture.\n',
 'In the modern era, tourism has had a great impact on society.\nI agree with this statement and my view here.\nFirst of all, tourism has great impact on every country economy beacuse where foreigners come to see their tourist place they spend money to buy many things.\nMoreover, some tourism places like jaipur the

In [209]:
print(corrected_texts[1])

In the modern era, tourism has had a great impact on society.
I agree with this statement and my view here.
First of all, tourism has great impact on every country economy beacuse where foreigners come to see their tourist place they spend money to buy many things.
Moreover, some tourism places like jaipur there are many historical buildings and every year lot of people visit these places to see there.
Building and when they saw these buildings they bought some tickets.
This money is spent on building by the government.

Moreover, if individuals come from another country then they expand their culture one place to another place and it is a big way to expand our culture.



----
<a id="3"></a>

### 3. Tense Predictions

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, BertModel

model_name = "bert-base-uncased"
tense_model = BertModel.from_pretrained(model_name)
tense_tokenizer = AutoTokenizer.from_pretrained("model/tense_model/tense_tokenizer")


In [None]:
class TenseClassifier(nn.Module):
  
    def __init__(self, bert_model, num_classes):
        super(TenseClassifier, self).__init__()
        self.bert = bert_model
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        x = self.relu1(pooled_output)
        x = self.relu2(x)
        logits = self.fc(x)
        return logits

model_path = "model/tense_model/tense.pt"
tense_model = TenseClassifier(tense_model, num_classes=12)

tense_model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')), strict=False)

In [None]:
tense_labels = {
    'present simple': 0,
    'future simple': 1,
    'past simple': 2,
    'present perfect continuous': 3,
    'future perfect': 4,
    'past perfect': 5,
    'future continuous': 6,
    'past perfect continuous': 7,
    'present continuous': 8,
    'past continuous': 9,
    'future perfect continuous': 10,
    'present perfect': 11,
}
def predict_tense(sentence, model, tokenizer, tense_labels):
    # tokenizer
    encoded_sentence = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt")
    
    with torch.no_grad():
        logits = model(encoded_sentence['input_ids'], encoded_sentence['attention_mask'])
        predicted_label = torch.argmax(logits, dim=1).item()
    
    predicted_tense = [k for k, v in tense_labels.items() if v == predicted_label][0]
    
    return predicted_tense

sentence_to_predict = """
In 2009, the inhabitants in Vietnam reached 95 million individuals.
"""

predicted_tense = predict_tense(sentence_to_predict, tense_model, tense_tokenizer, tense_labels)
print(f"The predicted tense for the sentence is: {predicted_tense}")

----
<a id="4"></a>

### 4. Spelling check

In [210]:
import enchant
import string

dictionary = enchant.Dict("en_US")

sentence = """
In contemporary era thattourism had been show great  impact on society . I agree this statement and my view here
Firsty , tourism has great impact on everycountry economy beacuse where foriegners are come to see their tourist place they spend money to buy many things . Moreover , some tourism place like a jaipur there is many historical builiding and every year lot of people visit these place to see there 
Building and when they saw these building buy some tickets . These money spend on building by government .
Moreover if they another countries individuals come their country than they expand their culture one place to another place and it is big way exploure our culture .
"""

def spell_checker(sentence):
    words = sentence.split()
    corrected_sentence = []

    corrections = {}

    for word in words:
        original_word = word.strip(string.punctuation)

        if not original_word:
            corrected_sentence.append(word)  # Skip empty words
        elif not dictionary.check(original_word):
            suggestions = dictionary.suggest(original_word)
            if suggestions:
                corrected_word = suggestions[0]  # Use the first suggestion
                corrections[original_word] = corrected_word
                corrected_sentence.append(word.replace(original_word, corrected_word))
            else:
                corrected_sentence.append(word)
        else:
            corrected_sentence.append(word)

    corrected_text = ' '.join(corrected_sentence)
    return corrected_text, corrections

spell_check, corrections = spell_checker(sentence)
print("Corrected Text:")
print(spell_check)
print("Corrections:")
print(corrections)


Corrected Text:
In contemporary era that tourism had been show great impact on society . I agree this statement and my view here First , tourism has great impact on every country economy because where foreigners are come to see their tourist place they spend money to buy many things . Moreover , some tourism place like a Jaipur there is many historical building and every year lot of people visit these place to see there Building and when they saw these building buy some tickets . These money spend on building by government . Moreover if they another countries individuals come their country than they expand their culture one place to another place and it is big way explore our culture .
Corrections:
{'thattourism': 'that tourism', 'Firsty': 'First', 'everycountry': 'every country', 'beacuse': 'because', 'foriegners': 'foreigners', 'jaipur': 'Jaipur', 'builiding': 'building', 'exploure': 'explore'}


In [211]:
num_return_sequences = 2
corrected_texts = correct_and_merge(spell_check, num_return_sequences)
corrected_texts

['In the modern era, tourism has had a great impact on society.\nI agree with this statement and my view here First, tourism has great impact on every country economy because where foreigners come to see their tourist place they spend money to buy many things.\nMoreover, some tourism places like Jaipur there are many historical buildings and every year lot of people visit these places to see there Building and when they see these buildings buy some tickets.\nThis money is spent on building by the government.\nMoreover if individuals come from another country then they expand their culture one place to another place and it is a big way to explore our culture.\n',
 'In a contemporary era, tourism has had a great impact on society.\nI agree with this statement and my view here. First, tourism has great impact on every country economy because where foreigners come to see their tourist place they spend money to buy many things.\nMoreover, some tourism places like Jaipur there are many histo

In [212]:
import difflib
def highlight(correct_sentence, error_Sentence, color):
    differ = difflib.Differ()
    diff = list(differ.compare(correct_sentence.split(), error_Sentence.split()))

    highlighted_diff = []
    for word in diff:
        if word.startswith(' '):
            highlighted_diff.append(word[2:])
        elif word.startswith('- '):
            highlighted_diff.append('<span style="background-color:'+color+';">{}</span>'.format(word[2:]))
    
    highlighted_sentence = ' '.join(highlighted_diff)

    return highlighted_sentence

mark_f = highlight(sentence,corrected_texts[0],"#FF6666")
mark_t = highlight(corrected_texts[0],sentence,"#7ED957")
print("<p>"+mark_t+"</p>")
print("<p>"+mark_f+"</p>")

In <span style="background-color:#7ED957;">the</span> <span style="background-color:#7ED957;">modern</span> <span style="background-color:#7ED957;">era,</span> <span style="background-color:#7ED957;">tourism</span> <span style="background-color:#7ED957;">has</span> had <span style="background-color:#7ED957;">a</span> great impact on <span style="background-color:#7ED957;">society.</span> I agree <span style="background-color:#7ED957;">with</span> this statement and my view here <span style="background-color:#7ED957;">First,</span> tourism has great impact on <span style="background-color:#7ED957;">every</span> <span style="background-color:#7ED957;">country</span> economy <span style="background-color:#7ED957;">because</span> where <span style="background-color:#7ED957;">foreigners</span> come to see their tourist place they spend money to buy many <span style="background-color:#7ED957;">things.</span> <span style="background-color:#7ED957;">Moreover,</span> some tourism <span style="b

----
<a id="5"></a>

### 5. Identify error types

***You need to run all the code in the English learning support section to execute the following part.***

In [213]:
import difflib

def identify_error_types(paragraph, corrected_paragraph, corrected_words):
    differ = difflib.Differ()
    
    list_wrong_grammar_sentences = []
    list_wrong_spell_sentences = []
    
    for orig_sentence, corr_sentence in zip(paragraph, corrected_paragraph):
        diff = list(differ.compare(orig_sentence.split(), corr_sentence.split()))
        
        grammar_errors = []
        spelling_errors = []

        for word_diff in diff:
            word = word_diff[2:]
            
            if word_diff.startswith('- '): 
                if word in corrected_words:
                    spelling_errors.append(word)
                else:
                    grammar_errors.append(word)

        if grammar_errors:
            list_wrong_grammar_sentences.append(orig_sentence)
        elif spelling_errors and not grammar_errors:
            list_wrong_spell_sentences.append(orig_sentence)
    
    return list_wrong_grammar_sentences, list_wrong_spell_sentences

wrong_grammar, wrong_spelling = identify_error_types(corrected_texts[0].split('\n'), sentence.split('\n'), corrections)

print("Sentences with Grammar Errors:")
for sentence in wrong_grammar:
    print(sentence)

print("Sentences with Spelling Errors:")
for sentence in wrong_spelling:
    print(sentence)


Sentences with Grammar Errors:
In the modern era, tourism has had a great impact on society.
I agree with this statement and my view here First, tourism has great impact on every country economy because where foreigners come to see their tourist place they spend money to buy many things.
Moreover, some tourism places like Jaipur there are many historical buildings and every year lot of people visit these places to see there Building and when they see these buildings buy some tickets.
This money is spent on building by the government.
Moreover if individuals come from another country then they expand their culture one place to another place and it is a big way to explore our culture.
Sentences with Spelling Errors:


In [214]:
w_tense = []
for sentence in wrong_grammar:
    w_tense.append(predict_tense(sentence_to_predict, tense_model, tense_tokenizer, tense_labels))

wrong = []
# result = [{"paragraph": list_para[i], "summary": summaries[i]} for i in range(len(summaries))]
wrong = [{"wrong_sentence": wrong_grammar[i], "tense_labels": w_tense[i]}for i in range(len(w_tense))]
print([print(i["wrong_sentence"],":",i["tense_labels"]) for i in wrong])
print(corrections)

In the modern era, tourism has had a great impact on society. : past
I agree with this statement and my view here First, tourism has great impact on every country economy because where foreigners come to see their tourist place they spend money to buy many things. : past
Moreover, some tourism places like Jaipur there are many historical buildings and every year lot of people visit these places to see there Building and when they see these buildings buy some tickets. : past
This money is spent on building by the government. : past
Moreover if individuals come from another country then they expand their culture one place to another place and it is a big way to explore our culture. : past
[None, None, None, None, None]
{'thattourism': 'that tourism', 'Firsty': 'First', 'everycountry': 'every country', 'beacuse': 'because', 'foriegners': 'foreigners', 'jaipur': 'Jaipur', 'builiding': 'building', 'exploure': 'explore'}


----
<a id="6"></a>

### 6. Structure prediction constituency parser

***This part is no longer necessary.***

In [216]:
# !pip uninstall -q allennlp
# !pip uninstall -q allennlp-models

In [217]:
# !pip install -q https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz

In [218]:
# !python -m spacy download en

In [219]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [220]:
# This is unnecessary 
# !pip install h5py
# !pip install typing-extensions
# !pip install wheel

In [221]:
# !pip install --ignore-installed six

In [None]:
from allennlp_models import pretrained
# print(pretrained.get_pretrained_models())
predictor = pretrained.load_predictor("structured-prediction-constituency-parser")

In [None]:
test_sentence = "he is doing home work. She phaying game."
# test_sentence = test_sentence.rstrip('?:!.,;')
print (test_sentence)
parser_output = predictor.predict(test_sentence)
# print (parser_output)
tag = parser_output["pos_tags"]

In [None]:
tag

In [None]:
tree = parser_output["trees"]
tree

In [None]:
from nltk import tokenize
from nltk.tree import Tree

tree = Tree.fromstring(tree)
print(tree)
print(tree.pretty_print())

In [227]:
import spacy
from collections import defaultdict
import re
# Tải mô hình ngôn ngữ tiếng Anh của spaCy
nlp = spacy.load("en_core_web_sm")

def get_sentence_structure(sentence):
    # Sử dụng spaCy để phân tích câu và lấy danh sách các từ (tokens) và POS tags của câu
    doc = nlp(sentence)
    tokens = [token.text for token in doc]
    pos_tags = [token.pos_ for token in doc]
    return tokens, pos_tags

In [228]:
def find_duplicate_sentence_structures(text):
    # Sử dụng module re để tách câu theo nhiều dấu câu khác nhau
    sentences = re.split(r'[.,!?]', text)
    
    sentence_structures = defaultdict(list)

    for sentence in sentences:
        _, pos_tags = get_sentence_structure(sentence)

        structure_key = tuple(pos_tags)

        sentence_structures[structure_key].append(sentence)

    found_duplicates = False 
    for structure_key, sentences in sentence_structures.items():
        if len(sentences) > 1:
            found_duplicates = True
            print(f"Similar Struture: {structure_key}")
            print("Similar structure sentences:")
            for sentence in sentences:
                print(f"- {sentence.strip()}\n")  # Loại bỏ khoảng trắng dư thừa
            print()
    
    if not found_duplicates:
        print("Không có câu nào lặp cấu trúc.")

# Ví dụ sử dụng
text = """
In 2009, the inhabitants in Vietnam reached 95 million individuals.
Simultaneously, the population in Japan hit 50 million citizens.
"""
find_duplicate_sentence_structures(corrected_texts[0])

Không có câu nào lặp cấu trúc.


# Question Generation and Fact - Check

----
<a id="7"></a>

### 1. T5 for summarize task

In [None]:
# import fitz  # PyMuPDF
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_name = "t5-small" # model
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
t5_model = T5ForConditionalGeneration.from_pretrained(model_name)

In [21]:
import re
import textract

# Đọc nội dung của tài liệu PDF
text = textract.process('pdf/Report.pdf', encoding='utf-8')

# Sử dụng biểu thức chính quy để cắt thành các đoạn văn
all_paragraphs = re.split(r'\s{2,}', text.decode('utf-8'))
num_paragraph= len(text)
print(num_paragraph)

13876


In [22]:
list_para = []
list_para = [para for para in all_paragraphs if len(para.split()) >= 20] # list of paragraphs which have more than 20 words
len(list_para)

21

In [23]:
# Summarize paragraph using t5 model

summaries = []
for i,paragraph in enumerate(list_para):
    input_text = "summarize: " + paragraph
    input_ids = t5_tokenizer.encode(input_text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = t5_model.generate(input_ids, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summaries.append(summary)
    
    # if i==3: break

In [24]:
list_para[:4]

['Abstract. This paper aims to develop a system that will help in recommendation of courses for an upcoming semester based on the performance of previous semesters.',
 "It has always been a tough choice for the students to choose the courses in different semesters in which there is possibility to score good grades apart from the interest in the course. IIIT-Delhi offers variety of courses with mandatory courses in first 4 semesters (with exception of 2 to 3 electives) and all elective courses from fifth semester onwards. Hence, choosing the courses based on the verbal recommendation from the seniors, instructors and fellowmates becomes a hectic task. For easing this process of course recommendation for an upcoming semester, we have developed a system which deploys simple yet powerful recommendation techniques such as auto-encoders, hybrid matrix factorization and similarity based approaches. It is a GUI based system which takes an input of student's ID (which is stored in the backend d

In [25]:
# merge into result 

result = [{"paragraph": list_para[i], "summary": summaries[i]} for i in range(len(summaries))]

In [26]:
result[:5]

[{'paragraph': 'Abstract. This paper aims to develop a system that will help in recommendation of courses for an upcoming semester based on the performance of previous semesters.',
  'summary': 'this paper aims to develop a system that will help in recommendation of courses for an upcoming semester based on the performance of previous semesters. this paper aims to develop a system that will help in recommendation of courses based on the performance of previous semesters.'},
 {'paragraph': "It has always been a tough choice for the students to choose the courses in different semesters in which there is possibility to score good grades apart from the interest in the course. IIIT-Delhi offers variety of courses with mandatory courses in first 4 semesters (with exception of 2 to 3 electives) and all elective courses from fifth semester onwards. Hence, choosing the courses based on the verbal recommendation from the seniors, instructors and fellowmates becomes a hectic task. For easing this

----
<a id="8"></a>

### 2. Question generation

***You have to summary the file before this task***

In [87]:
# Clone this responsitory for questions generator

# !git clone https://github.com/amontgomerie/question_generator
# !pip install -r question_generator/requirements.txt -qq
# !python run_qg.py --text_file question_generator/articles/twitter_hack.txt

python: can't open file 'c:\\Users\\Sysme Hue\\Desktop\\LMS\\Libs\\run_qg.py': [Errno 2] No such file or directory


In [90]:
# %cd question_generator

c:\Users\Sysme Hue\Desktop\LMS\Libs\question_generator


In [92]:
# import file from another folder
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, 'question_generator')

from questiongenerator import QuestionGenerator

In [None]:
qg = QuestionGenerator()

In [None]:
text = result[2]["summary"]
q  = qg.generate(text, num_questions=3)

In [37]:
q

[{'question': 'how many students have taken the course?',
  'answer': 'the dataset consists 739 students and 306 subjects with mapping of each student to the grades for each course the student has taken throughout the duration of their degree.'},
 {'question': 'how many students have taken the iitt?',
  'answer': 'the dataset has been acquired from the official IIIT-Delhi academics department for the students of 7 Computer Science passout batches.'},
 {'question': 'how many subjects do students have taken?',
  'answer': [{'answer': 'dataset', 'correct': False},
   {'answer': '7 Computer Science', 'correct': False},
   {'answer': '306', 'correct': True},
   {'answer': '739', 'correct': False}]}]

In [38]:
print(text)
for i in range(len(q)): print(q[i]["question"]) #  7 Computer Science passout batches.

the dataset has been acquired from the official IIIT-Delhi academics department for the students of 7 Computer Science passout batches. the dataset consists 739 students and 306 subjects with mapping of each student to the grades for each course the student has taken throughout the duration of their degree.
how many students have taken the course?
how many students have taken the iitt?
how many subjects do students have taken?


----
<a id="9"></a>

### 3. Fact -check

In [36]:
import torch
import torch.nn as nn

from transformers import BertTokenizer, BertModel

In [37]:
# BERT MODEL

class BERTClassificationModel(nn.Module):
    def __init__(self, bert_model_name, num_labels):
        super(BERTClassificationModel, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size * 2, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = torch.cat((outputs.last_hidden_state[:, 0, :], outputs.last_hidden_state[:, -1, :]), dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [38]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

In [None]:
import torch
model_path = "model/bert_classification_model.pth"
# Tạo mô hình mới
loaded_model = BERTClassificationModel('bert-base-uncased', num_labels=3)


In [41]:
if torch.cuda.is_available():
    loaded_model.load_state_dict(torch.load(model_path))
else:
    loaded_model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')), c)

In [22]:
# Tải lại tokenizer từ đường dẫn đã lưu
loaded_tokenizer = BertTokenizer.from_pretrained("model/tokenizer")

In [23]:
max_seq_length = 128
def predict_premise_hypothesis(premise_text, hypothesis_text, model, tokenizer):
    # Chuẩn bị dữ liệu đầu vào cho mô hình
    inputs = tokenizer(premise_text, hypothesis_text, padding=True, truncation=True, max_length=max_seq_length, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Dự đoán
    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        probabilities = torch.softmax(logits, dim=1)
        predicted_label = torch.argmax(probabilities, dim=1).item()

    return predicted_label

pre = ["entailment", "neutral", "contradiction"]
# Sử dụng hàm predict_premise_hypothesis để dự đoán

premise_text = "Yesterday, I bought a new computer"
hypothesis_text = "I did not buy a new computer"
predicted_label = predict_premise_hypothesis(premise_text, hypothesis_text, loaded_model, loaded_tokenizer)

print("premise_text :", premise_text, "\n", "hypothesis_text :", hypothesis_text)
print("Predicted Label:", pre[predicted_label])


premise_text : Yesterday, I bought a new computer 
 hypothesis_text : I did not buy a new computer
Predicted Label: contradiction


In [54]:
pre = ["entailment", "neutral", "contradiction"]
# Sử dụng hàm predict_premise_hypothesis để dự đoán
premise_text = "The likelihood is 100%"
hypothesis_text = "Maybe the probability is 100%"
predicted_label = predict_premise_hypothesis(premise_text, hypothesis_text, loaded_model, loaded_tokenizer)

print("premise_text :", premise_text, "\n", "hypothesis_text :", hypothesis_text)
print("Predicted Label:", pre[predicted_label])

premise_text : The likelihood is 100% 
 hypothesis_text : Maybe the probability is 100%
Predicted Label: neutral


In [None]:
premise_text = "The dataset had acquired for the student of 7 computer science"
hypothesis_text = result[2]["summary"]

predicted_label = predict_premise_hypothesis(premise_text, hypothesis_text, loaded_model, loaded_tokenizer)
print("premise_text :", premise_text, "\n", "hypothesis_text :", hypothesis_text)
print("Predicted Label:", pre[predicted_label])

# Recommend system

----
<a id="10"></a>

### 1. Association Rule - Apriori

In [1]:
import pandas as pd
import numpy as np
import re
from mlxtend.frequent_patterns import apriori, association_rules
import en_core_web_sm
spc_en = en_core_web_sm.load()

  from .autonotebook import tqdm as notebook_tqdm
  np.bool8: (False, True),
  np.bool8: (False, True),


In [2]:
# dataset --> https://www.kaggle.com/datasets/leewanhung/coursera-dataset?select=df_c.csv
course = pd.read_csv("../data/course.csv")
# id = pd.read_csv("../data/id.csv")

In [4]:
course.head()

Unnamed: 0.1,Unnamed: 0,Course_id,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills,Specialized,Sequence,Vector,IDF,BERT_Encoded
0,0,1,write feature length screenplay film television,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,write full length feature film script course w...,drama comedy peer screenwrite film document re...,Arts and Humanities,write feature length screenplay film televisio...,[3.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[ 3.09796989e-01 8.47294629e-02 4.00437862e-...
1,1,2,business strategy business model canvas analys...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,end guide project fluent identifying create bu...,finance business plan persona user experience ...,Business,business strategy business model canvas analys...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[-1.62371062e-02 -5.21499477e-02 3.00578535e-...
2,2,3,silicon thin film solar cell,�cole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,course consist general presentation solar cell...,chemistry physics solar energy film lambda cal...,Physical Science and Engineering,silicon thin film solar cell course consist ge...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[ 1.11733921e-01 2.74278939e-01 4.72732037e-...
3,3,4,finance manager,IESE Business School,Intermediate,4.8,https://www.coursera.org/learn/operational-fin...,come number always meet eye operational financ...,account receivable dupont analysis analysis ac...,Business,finance manager come number always meet eye op...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[-4.35875319e-02 1.21336073e-01 3.25895369e-...
4,4,5,retrieve datum use single table sql query,Coursera Project Network,Beginner,4.6,https://www.coursera.org/learn/single-table-sq...,course learn effectively retrieve datum relati...,datum analysis select sql database management ...,Information Technology,retrieve datum use single table sql query cour...,[117.0 34.0 27.0 33.0 258.0 201.0 42.0 0.0 125...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[ 6.04763143e-02 3.37209553e-02 4.53004211e-...


In [5]:
history = pd.read_csv("../data/enrolled_course.csv", encoding='latin-1')

In [None]:
history # 395	user0396	

Unnamed: 0,User_id,History_course_id,History_course_name
0,user0001,"0687, 0355, 1677","Sequence Models, Natural Language Processing w..."
1,user0002,"0636, 0637, 1787","Exploratory Data Analysis, Exploratory Data An..."
2,user0003,"0120, 0286, 2712","Managing Big Data with MySQL, Introduction to ..."
3,user0004,"0406, 0603, 0840",Data Visualization and Communication with Tabl...
4,user0005,"0241, 2558, 3025, 3503",Deploy Models with TensorFlow Serving and Flas...
...,...,...,...
2133,user2134,"0685, 0819",
2134,user2135,"1677, 2924, 1755, 2754",
2135,user2136,"3361, 3369, 3399",
2136,user2137,"0685, 0819",


In [6]:
data = list(history['History_course_id'].apply(lambda x:x.split(",") ))

In [7]:
data

[['0687', ' 0355', ' 1677'],
 ['0636', ' 0637', ' 1787'],
 ['0120', ' 0286', ' 2712'],
 ['0406', ' 0603', ' 0840'],
 ['0241', ' 2558', ' 3025', ' 3503'],
 ['0450', ' 0662', ' 2696'],
 ['0391', ' 1808', ' 3454'],
 ['2533', ' 2543', ' 2564'],
 ['2919', ' 3097', ' 3260'],
 ['1581', ' 2154', ' 2725'],
 ['0088', ' 0112', ' 0233'],
 ['0211', ' 0220', ' 0241'],
 ['0149', ' 0211', ' 0288'],
 ['0220', ' 0241', ' 0288'],
 ['0199', ' 0211', ' 0241'],
 ['0120', ' 0286', ' 0413'],
 ['0066', ' 0149', ' 0396'],
 ['0108', ' 0279', ' 0687'],
 ['0330', ' 0376', ' 0530'],
 ['0058', ' 0221', ' 0849'],
 ['0485', ' 1210', ' 1792'],
 ['1366', ' 2486', ' 3352'],
 ['3399', ' 3423', ' 3454'],
 ['0543', ' 2854', ' 3352'],
 ['2845', ' 3349', ' 3361'],
 ['3352', ' 3369', ' 3399'],
 ['3334', ' 3352', ' 3361'],
 ['3361', ' 3369', ' 3370'],
 ['3361', ' 2708'],
 ['3352', ' 3369', ' 3376'],
 ['3352', ' 3369', ' 3423'],
 ['3454', ' 3465', ' 3472'],
 ['3465', ' 3472', ' 3475'],
 ['3475', ' 3483', ' 3491'],
 ['0636', ' 06

In [8]:
#Let's transform the list, with one-hot encoding
from mlxtend.preprocessing import TransactionEncoder
a = TransactionEncoder()
a_data = a.fit(data).transform(data)
df = pd.DataFrame(a_data,columns=a.columns_)
df = df.replace(False,0)
df

Unnamed: 0,0005,0005.1,0014,0022,0024,0031,0044,0045,0053,0062,...,3465,3471,3475,3494,3498,3500,3502,3504,3514,Data analysis
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2133,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2134,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2136,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
df2 = df

In [10]:
#set a threshold value for the support value and calculate the support value.
apriori_t = apriori(df2, min_support = 0.01, use_colnames = True, verbose = 1)
apriori_t



Processing 155 combinations | Sampling itemset size 54


Unnamed: 0,support,itemsets
0,0.011225,( 0014)
1,0.011225,( 0236)
2,0.014032,( 0257)
3,0.012629,( 0272)
4,0.013096,( 0364)
...,...,...
189,0.013096,"( 3469, 2993, 3108, 2842)"
190,0.013096,"( 2993, 3503, 3108, 2842)"
191,0.013096,"( 2993, 3469, 3503, 2842)"
192,0.013096,"( 3469, 3503, 3108, 2842)"


In [11]:
# Let's view our interpretation values using the Associan rule function.
df_ar = association_rules(apriori_t, metric = "confidence", min_threshold = 0.6)
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,( 0014),( 1289),0.011225,0.028064,0.011225,1.000000,35.633333,0.010910,inf,0.982971
1,( 0236),( 1115),0.011225,0.012161,0.011225,1.000000,82.230769,0.011089,inf,0.999054
2,( 1115),( 0236),0.012161,0.011225,0.011225,0.923077,82.230769,0.011089,12.854069,1.000000
3,( 2542),( 0236),0.012161,0.011225,0.011225,0.923077,82.230769,0.011089,12.854069,1.000000
4,( 0236),( 2542),0.011225,0.012161,0.011225,1.000000,82.230769,0.011089,inf,0.999054
...,...,...,...,...,...,...,...,...,...,...
514,"( 3469, 3108)","( 3503, 2993, 2842)",0.013096,0.013096,0.013096,1.000000,76.357143,0.012925,inf,1.000000
515,( 2993),"( 3469, 3503, 3108, 2842)",0.013096,0.013096,0.013096,1.000000,76.357143,0.012925,inf,1.000000
516,(2842),"( 3469, 3503, 2993, 3108)",0.013096,0.013096,0.013096,1.000000,76.357143,0.012925,inf,1.000000
517,( 3108),"( 3469, 3503, 2993, 2842)",0.013096,0.013096,0.013096,1.000000,76.357143,0.012925,inf,1.000000


In [12]:
df_ar[:30]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,( 0014),( 1289),0.011225,0.028064,0.011225,1.0,35.633333,0.01091,inf,0.982971
1,( 0236),( 1115),0.011225,0.012161,0.011225,1.0,82.230769,0.011089,inf,0.999054
2,( 1115),( 0236),0.012161,0.011225,0.011225,0.923077,82.230769,0.011089,12.854069,1.0
3,( 2542),( 0236),0.012161,0.011225,0.011225,0.923077,82.230769,0.011089,12.854069,1.0
4,( 0236),( 2542),0.011225,0.012161,0.011225,1.0,82.230769,0.011089,inf,0.999054
5,( 0236),(0342),0.011225,0.013564,0.011225,1.0,73.724138,0.011073,inf,0.997635
6,(0342),( 0236),0.013564,0.011225,0.011225,0.827586,73.724138,0.011073,5.734892,1.0
7,(0256),( 0257),0.013564,0.014032,0.013564,1.0,71.266667,0.013374,inf,0.999526
8,( 0257),(0256),0.014032,0.013564,0.013564,0.966667,71.266667,0.013374,29.593078,1.0
9,( 0488),( 0272),0.012629,0.012629,0.012629,1.0,79.185185,0.012469,inf,1.0


In [13]:
result = course.loc[course['Course_id'] == 1289]['Course Name']
print('antecedents:', result)

antecedents: 1288    datum visualization python
Name: Course Name, dtype: object


In [None]:
for n, antecedents in enumerate(df_ar['antecedents']):
    if isinstance(antecedents, frozenset):
        antecedents = list(antecedents)
    antecedent_ids = ','.join(map(str, antecedents)).strip()  # Chuyển thành danh sách và tách các ID

    antecedent_names = []
    for antecedent_id in antecedent_ids.split(','):
        antecedent_id = antecedent_id.strip()
        result = course.loc[course['Course_id'] == int(antecedent_id)]['Course Name'].iloc[0]
        antecedent_names.append(result)

    antecedent_names_str = ', '.join(antecedent_names)

    print(n)
    print('antecedents:', antecedent_names_str)

    consequents = df_ar['consequents'][n]
    if isinstance(consequents, frozenset):
        consequents = list(consequents)
    consequent_ids = ','.join(map(str, consequents)).strip()  # Chuyển thành danh sách và tách các ID

    consequent_names = []
    for consequent_id in consequent_ids.split(','):
        consequent_id = consequent_id.strip()
        result = course.loc[course['Course_id'] == int(consequent_id)]['Course Name'].iloc[0]
        consequent_names.append(result)

    consequent_names_str = ', '.join(consequent_names)

    print(consequent_names_str)
    print('consequents:', consequent_names_str, '\n', '-' * 30)


0
antecedents: business statistic analysis capstone
datum visualization python
consequents: datum visualization python 
 ------------------------------
1
antecedents: matrix method
precalculus periodic function
consequents: precalculus periodic function 
 ------------------------------
2
antecedents: precalculus periodic function
matrix method
consequents: matrix method 
 ------------------------------
3
antecedents: matrix method
logic economist
consequents: logic economist 
 ------------------------------
4
antecedents: logic economist
matrix method
consequents: matrix method 
 ------------------------------
5
antecedents: game theory python
matrix method
consequents: matrix method 
 ------------------------------
6
antecedents: matrix method
game theory python
consequents: game theory python 
 ------------------------------
7
antecedents: art music production
art music production
consequents: art music production 
 ------------------------------
8
antecedents: art music production
a

82
antecedents: nlp twitter sentiment analysis
transfer learn nlp tensorflow hub
consequents: transfer learn nlp tensorflow hub 
 ------------------------------
83
antecedents: game theory python
logic economist
consequents: logic economist 
 ------------------------------
84
antecedents: logic economist
game theory python
consequents: game theory python 
 ------------------------------
85
antecedents: introduction calculus
analytic combinatoric
consequents: analytic combinatoric 
 ------------------------------
86
antecedents: analytic combinatoric
introduction calculus
consequents: introduction calculus 
 ------------------------------
87
antecedents: image video processing mar hollywood stop hospital
precalculus relation function
consequents: precalculus relation function 
 ------------------------------
88
antecedents: precalculus relation function
image video processing mar hollywood stop hospital
consequents: image video processing mar hollywood stop hospital 
 ------------------

In [84]:
def recommend_courses(enrolled_courses, df_ar, num_recommendations=5):
    # Tạo một danh sách để lưu trữ các khoá học được đề xuất
    recommended_courses = []

    # Duyệt qua từng tập luật kết hợp trong df_ar
    for index, row in df_ar.iterrows():
        antecedents = row['antecedents']
        consequents = row['consequents']

        # Chuyển các ID trong antecedents thành các ID không có khoảng cách
        antecedents_cleaned = [course_id.replace(" ", "") for course_id in antecedents]

        # Kiểm tra nếu có ít nhất một khoá học từ antecedents có trong danh sách enrolled_courses
        if any(course_id in antecedents_cleaned for course_id in enrolled_courses):
            # Lấy danh sách các khoá học trong consequents
            recommended_courses.extend(consequents)

    # Loại bỏ các khoá học đã đăng ký và lặp lại
    recommended_courses = list(set(recommended_courses) - set(enrolled_courses))
    print(recommended_courses)

    # Chọn một số lượng giới hạn của khoá học để đề xuất
    if len(recommended_courses) > num_recommendations:
        recommended_courses = recommended_courses[:num_recommendations]

    return recommended_courses

# Danh sách các khoá học mà người dùng đã đăng ký
enrolled_courses = ["0966", "1677", "1109"]

# Đề xuất các khoá học dựa trên danh sách đã đăng ký và bảng df_ar
recommended_courses = recommend_courses(enrolled_courses, df_ar)
course_dict = dict(zip(course['Course_id'], course['Course Name']))
# In ra các khoá học được đề xuất
print("Enrolled Courses:")
for course_id in enrolled_courses:
    clean_course_id = course_id.replace(" ", "")  # Loại bỏ dấu cách
    course_name = course_dict.get(int(clean_course_id), 'Not Found')  # Lấy tên khoá học từ từ điển course_dict
    print(f"- {course_name} (ID: {clean_course_id})")
    
# In ra các khoá học được đề xuất, nhưng chỉ hiển thị những khoá học chưa được đăng ký
print("Recommend Courses:")
for course_id in recommended_courses:
    clean_course_id = course_id.replace(" ", "")  # Loại bỏ dấu cách
    course_name = course_dict.get(int(clean_course_id), 'Not Found')  # Lấy tên khoá học từ từ điển course_dict
    
    # Kiểm tra xem khoá học đã được đăng ký hay chưa
    if clean_course_id not in enrolled_courses:
        print(f"- {course_name} (ID: {clean_course_id})")


[' 1109', ' 2497']
Enrolled Courses:
- interactive word embedding use word vec plotly (ID: 0966)
- nlp twitter sentiment analysis (ID: 1677)
- sentiment analysis deep learning use bert (ID: 1109)
Recommend Courses:
- transfer learn nlp tensorflow hub (ID: 2497)


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


----
<a id="11"></a>

### 2. Similar Courses


In [16]:
import torch
from transformers import BertTokenizer, BertModel
from nltk.corpus import stopwords
import en_core_web_sm
spc_en = en_core_web_sm.load()
import re
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

  if not hasattr(tensorboard, "__version__") or LooseVersion(
  ) < LooseVersion("1.15"):


In [18]:
# Define a function to preprocess text
# spc_en = en_core_web_sm.load()

def preprocess_text(text):
    stopwords_eng = stopwords.words("english")
    text = text.lower()
    text = text.replace(",", "").replace(".", "").replace("!", "").replace("?", "")
    text = re.sub(r"[\W\d_]+", " ", text)
    text = [pal for pal in text.split() if pal not in stopwords_eng]
    spc_text = spc_en(" ".join(text))
    tokens = [word.lemma_ if word.lemma_ != "-PRON-" else word.lower_ for word in spc_text]
    return " ".join(tokens)

In [None]:
def cosine_similarity(vector1, vector2):
    # Chuyển đổi các vector cột thành vector hàng
    vector1 = vector1.reshape(1, -1)
    vector2 = vector2.reshape(1, -1)
    
    dot_product = np.dot(vector1, vector2.T)  # Sử dụng vector thứ hai chuyển vị
    norm1 = np.linalg.norm(vector1)
    norm2 = np.linalg.norm(vector2)
    similarity = dot_product / (norm1 * norm2)
    return similarity[0, 0]

In [None]:
def euclidean_distance(vector1, vector2):
    # Tính hiệu của hai vector
    diff = vector1 - vector2
    
    # Tính khoảng cách Euclidean bằng cách tính norm của hiệu
    distance = np.linalg.norm(diff)
    return distance

In [19]:
# Hàm mã hoá câu với BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

def encode_sequence_with_bert(sequence):
    # Tiền xử lý và mã hoá câu
    input_ids = tokenizer(preprocess_text(sequence), return_tensors="pt").input_ids
    with torch.no_grad():
        outputs = model(input_ids)
        hidden_states = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    return hidden_states

def nearest_bert(new_sentence):
    # Tiền xử lý và mã hoá câu mới với BERT
    encoded_new_sentence = encode_sequence_with_bert(new_sentence)

    # Chuyển đổi chuỗi số thực từ cột BERT_Encoded thành ma trận 2D
    bert_encoded_matrix = course['BERT_Encoded'].apply(lambda x: np.fromstring(x[1:-1], sep=' '))
    bert_encoded_matrix = np.vstack(bert_encoded_matrix)

    # Tạo mô hình KNN với k=5 (tìm 5 câu giống nhất)
    knn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
    knn_model.fit(bert_encoded_matrix)

    # Tìm 5 câu giống với câu mới nhất trong BERT_Encoded
    nearest_neighbors = knn_model.kneighbors([encoded_new_sentence], n_neighbors=5)

    print("Description:")
    print(new_sentence)
    print("\nNearest Course:")
    for i, neighbor_index in enumerate(nearest_neighbors[1][0]):
        neighbor_sequence = course.at[neighbor_index, "Course Name"]
        neighbor_specialized = course.at[neighbor_index, "Specialized"]

        print(f"Neighbor {i + 1}: {neighbor_sequence} -- Specialized: {neighbor_specialized}")

# Chuỗi câu mới
new_sentence = """
Linear regression, build an application
"""
nearest_bert(new_sentence)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Description:

Linear regression, build an application


Nearest Course:
Neighbor 1: practical machine learning -- Specialized: Data Science
Neighbor 2: practical machine learning -- Specialized: Data Science
Neighbor 3: introduction programming swift -- Specialized: Computer Science
Neighbor 4: apply social network analysis python -- Specialized: Data Science
Neighbor 5: advance linear model datum science least square -- Specialized: Data Science


In [63]:
vec1 = course[course["Course_id"] == 4]["IDF"]
vec1 = course[course["Course_id"] == 4]["IDF"]

# cosine_similarityfrom sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np

# Đọc dữ liệu khoá học
course = pd.read_csv("../data/course.csv", index_col=0)

# Load cột IDF đã mã hoá
idf_encoded_matrix = course["BERT_Encoded"].apply(lambda x: np.fromstring(x[1:-1], sep=' '))
idf_encoded_matrix = np.vstack(idf_encoded_matrix)

# Tạo mô hình KNN với k=5 (tìm 5 khoá gần nhất)
knn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
knn_model.fit(idf_encoded_matrix)

def find_nearest_courses(course_id, num):
    # Tìm vector IDF của khoá học cụ thể
    course_index = course[course["Course_id"] == course_id].index[0]
    course_idf_vector = idf_encoded_matrix[course_index]

    # Tìm 5 khoá gần nhất với khoá học cụ thể
    nearest_neighbors = knn_model.kneighbors([course_idf_vector], n_neighbors=num)

    # print(f"Course ID: {course_id}")
    # print("\nNearest Courses:")
    
    recommended_course_ids = []
    for i, neighbor_index in enumerate(nearest_neighbors[1][0]):
        neighbor_course_id = course.at[neighbor_index, "Course_id"]
        recommended_course_ids.append(neighbor_course_id)
        neighbor_course_name = course.at[neighbor_index, "Course Name"]
        # print(f"Neighbor {i + 1}: Course ID {neighbor_course_id} -- {neighbor_course_name}")

    return recommended_course_ids
find_nearest_courses(1,10)


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type([values.dtype, comps_array.dtype], [])


[1, 1482, 2733, 1630, 2012, 807, 615, 132, 2338, 1280]

In [82]:
def recommend_courses(enrolled_courses, df_ar, num_recommendations=5):
    # Tạo một danh sách để lưu trữ các khoá học được đề xuất
    recommended_courses = []

    # Duyệt qua từng tập luật kết hợp trong df_ar

    for index, row in df_ar.iterrows():
        antecedents = row['antecedents']
        consequents = row['consequents']

        # Chuyển các ID trong antecedents thành các ID không có khoảng cách và thành chuỗi
        antecedents_cleaned = [str(course_id).replace(" ", "") for course_id in antecedents]

        # Kiểm tra nếu có ít nhất một khoá học từ antecedents có trong danh sách enrolled_courses
        if any(course_id in antecedents_cleaned for course_id in enrolled_courses):
            # Lấy danh sách các khoá học trong consequents
            recommended_courses.extend(consequents)

    # Loại bỏ các khoá học đã đăng ký và lặp lại
    recommended_courses = list(set(recommended_courses) - set(enrolled_courses))

    # Kiểm tra số lượng khoá học được đề xuất, nếu ít hơn 5, tìm thêm các khoá học gần nhất
    if len(recommended_courses) < num_recommendations and  len(recommended_courses) != 0 :

        num_to_find = (num_recommendations - len(recommended_courses)) / len(recommended_courses)
        
        for course_id in enrolled_courses:
            recommended_courses.extend(find_nearest_courses(int(course_id), int(num_to_find)))
        
    # Loại bỏ các khoá học trùng lặp (nếu có)
    recommended_courses = list(set(recommended_courses))
    # Chọn một số lượng giới hạn của khoá học để đề xuất
    if len(recommended_courses) > num_recommendations:
        recommended_courses = recommended_courses[:num_recommendations]

    return recommended_courses


# Sử dụng hàm để đề xuất khoá học
enrolled_courses = ["0966", "1677", "1109"]
recommended_courses = recommend_courses(enrolled_courses, df_ar)

course_dict = dict(zip(course['Course_id'], course['Course Name']))
# In ra các khoá học đã đăng ký
print("Enrolled Courses:")
for course_id in enrolled_courses:
    clean_course_id = course_id.replace(" ", "")
    course_name = course_dict.get(int(clean_course_id), 'Not Found')
    print(f"- {course_name} (ID: {clean_course_id})")

# In ra các khoá học được đề xuất
print("Recommended Courses:")
for course_id in recommended_courses:
    clean_course_id = str(course_id).replace(" ", "")
    course_name = course_dict.get(int(clean_course_id), 'Not Found')
    print(f"- {course_name} (ID: {clean_course_id})")

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


In [102]:
def recommend_courses(enrolled_courses, df_ar, num_recommendations=5):

    recommended_courses = []

    for index, row in df_ar.iterrows():
        antecedents = row['antecedents']
        consequents = row['consequents']

        antecedents_cleaned = [str(course_id).replace(" ", "") for course_id in antecedents]

        if any(course_id in antecedents_cleaned for course_id in enrolled_courses):
            recommended_courses.extend(consequents)

    recommended_courses = [str(course_id).replace(" ", "") for course_id in recommended_courses]
    recommended_courses = list(set(recommended_courses) - set(enrolled_courses))

    if len(recommended_courses) > num_recommendations:
        recommended_courses = recommended_courses[:num_recommendations]
    if len(recommended_courses) < num_recommendations:
        
        num_to_find = (num_recommendations - len(recommended_courses)) / (len(recommended_courses)-0.1)
        re = []
        for course_id in recommended_courses:
            nearest_courses = find_nearest_courses(int(course_id), 5)
            for i,course in enumerate(nearest_courses):
                if course not in enrolled_courses and course not in recommended_courses and int(course) != int(course_id):
                    re.append(course)
                
                if len(re) + 1 >= round(num_to_find)*(i+1):
                    break

    # recommended_courses = list(set(recommended_courses) - set(enrolled_courses) - set(re))
    [recommended_courses.append(i)for i in set(re)]

    # if len(recommended_courses) > num_recommendations:
    #     recommended_courses = recommended_courses[:num_recommendations]

    return recommended_courses

# Sử dụng hàm để đề xuất khoá học
enrolled_courses = ["0966", "1677", "1109"]
recommended_courses = recommend_courses(enrolled_courses, df_ar)

course_dict = dict(zip(course['Course_id'], course['Course Name']))

print("Enrolled Courses:")
for course_id in enrolled_courses:
    clean_course_id = course_id.replace(" ", "")
    course_name = course_dict.get(int(clean_course_id), 'Not Found')
    print(f"- {course_name} (ID: {clean_course_id})")

print("Recommended Courses:")
for course_id in recommended_courses:
    clean_course_id = str(course_id).replace(" ", "")
    course_name = course_dict.get(int(clean_course_id), 'Not Found')
    print(f"- {course_name} (ID: {clean_course_id})")


Enrolled Courses:
- interactive word embedding use word vec plotly (ID: 0966)
- nlp twitter sentiment analysis (ID: 1677)
- sentiment analysis deep learning use bert (ID: 1109)
Recommended Courses:
- transfer learn nlp tensorflow hub (ID: 2497)
- optimize tensorflow model deployment tensorrt (ID: 520)
- simple recurrent neural network kera (ID: 817)
- neural network scratch tensorflow (ID: 3370)
- neural network visualizer web app python (ID: 2546)


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


In [71]:
round(1.5789473684210527)

2