In [11]:
!pip install nltk


Defaulting to user installation because normal site-packages is not writeable


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: three short sentences with overlapping vocabulary.
documents = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "the cat chased the mouse"
]

# Fit the vectorizer on the corpus and encode every document in one call.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Terms of the fitted vocabulary; used below as the column labels.
feature_names = vectorizer.get_feature_names_out()

# Densify the result: one row per document, one column per term.
df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)
print(df)


        cat    chased       dog       log       mat     mouse        on  \
0  0.374207  0.000000  0.000000  0.000000  0.492038  0.000000  0.374207   
1  0.000000  0.000000  0.468699  0.468699  0.000000  0.000000  0.356457   
2  0.381519  0.501651  0.000000  0.000000  0.000000  0.501651  0.000000   

        sat       the  
0  0.374207  0.581211  
1  0.356457  0.553642  
2  0.000000  0.592567  


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

# Toy corpus shared by both vectorizers
documents = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "the cat chased the mouse"
]

# ------------------------------
# Raw term counts (CountVectorizer)
# ------------------------------
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(documents)

count_df = pd.DataFrame(
    data=count_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)
print("🔸 Count Vectorizer Output:")
print(count_df)

# ------------------------------
# TF-IDF weights (TfidfVectorizer)
# ------------------------------
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print("\n🔹 TF-IDF Vectorizer Output:")
# Two decimals are enough to compare against the integer counts above.
print(tfidf_df.round(2))



🔸 Count Vectorizer Output:
   cat  chased  dog  log  mat  mouse  on  sat  the
0    1       0    0    0    1      0   1    1    2
1    0       0    1    1    0      0   1    1    2
2    1       1    0    0    0      1   0    0    2

🔹 TF-IDF Vectorizer Output:
    cat  chased   dog   log   mat  mouse    on   sat   the
0  0.37     0.0  0.00  0.00  0.49    0.0  0.37  0.37  0.58
1  0.00     0.0  0.47  0.47  0.00    0.0  0.36  0.36  0.55
2  0.38     0.5  0.00  0.00  0.00    0.5  0.00  0.00  0.59


In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

# Three multi-sentence paragraphs about AI (a larger corpus than the toy sentences)
documents = [
    """Artificial intelligence is transforming industries across the globe. 
    From healthcare to finance, AI systems are enhancing efficiency and decision-making processes.""",

    """The rapid development of artificial intelligence raises important ethical questions. 
    Governments and organizations must work together to ensure AI benefits society fairly.""",

    """Machine learning, a subset of artificial intelligence, is powering many everyday applications. 
    Voice assistants, recommendation systems, and autonomous vehicles all rely on machine learning algorithms."""
]

# -------------------------
# Raw term counts
# -------------------------
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(documents)
count_df = pd.DataFrame(
    data=count_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)

# -------------------------
# TF-IDF weights
# -------------------------
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Show both tables; head() keeps the output bounded if the corpus grows.
print("🔸 CountVectorizer Output:")
print(count_df.head(), "\n")

print("🔹 TfidfVectorizer Output:")
print(tfidf_df.round(2).head())


🔸 CountVectorizer Output:
   across  ai  algorithms  all  and  applications  are  artificial  \
0       1   1           0    0    1             0    1           1   
1       0   1           0    0    1             0    0           1   
2       0   0           1    1    1             1    0           1   

   assistants  autonomous  ...  society  subset  systems  the  to  together  \
0           0           0  ...        0       0        1    1   1         0   
1           0           0  ...        1       0        0    1   1         1   
2           1           1  ...        0       1        1    0   0         0   

   transforming  vehicles  voice  work  
0             1         0      0     0  
1             0         0      0     1  
2             0         1      1     0  

[3 rows x 53 columns] 

🔹 TfidfVectorizer Output:
   across    ai  algorithms  all   and  applications   are  artificial  \
0    0.24  0.18         0.0  0.0  0.14           0.0  0.24        0.14   
1    0.00  0.

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample documents (simulate student submissions or papers).
# Doc1 and Doc2 contain the same words in a different order.
documents = [
    "Artificial intelligence is transforming many industries and changing how we work.",
    "Artificial intelligence is changing how we work and transforming many industries.",
    "The history of AI dates back to the 1950s when the field was just beginning.",
    "This is a completely different paragraph unrelated to the others."
]

# Create TF-IDF vectors for all documents
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Compute cosine similarity between every pair of documents
similarity_matrix = cosine_similarity(tfidf_matrix)

# Display results as a labelled square table
doc_labels = [f"Doc{i+1}" for i in range(len(documents))]
df = pd.DataFrame(similarity_matrix, index=doc_labels, columns=doc_labels)

print("🔍 Cosine Similarity Matrix:")
print(df.round(2))

# Threshold above which a pair is flagged as potential plagiarism.
# The header below is derived from this value so the message can never
# disagree with the comparison actually performed.
threshold = 0.8
print(f"\n⚠️ Potential Plagiarism Cases (similarity > {threshold}):")
for i in range(len(documents)):
    # Start at i + 1: skip self-comparisons and visit each unordered pair once.
    for j in range(i + 1, len(documents)):
        sim = similarity_matrix[i, j]
        if sim > threshold:
            print(f" - {doc_labels[i]} and {doc_labels[j]} are {sim:.2f} similar.")


🔍 Cosine Similarity Matrix:
      Doc1  Doc2  Doc3  Doc4
Doc1  1.00  1.00  0.00  0.06
Doc2  1.00  1.00  0.00  0.06
Doc3  0.00  0.00  1.00  0.22
Doc4  0.06  0.06  0.22  1.00

⚠️ Potential Plagiarism Cases (similarity > 0.8):
 - Doc1 and Doc2 are 1.00 similar.


In [25]:
import math
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import re
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Fetch just the two NLTK resources used below
for resource in ('wordnet', 'stopwords'):
    nltk.download(resource)

# English stopwords held in a set for fast membership tests, plus the lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Simple tokenizer (no punkt dependency)
def simple_tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a maximal run of ``\\w`` characters (letters, digits,
    underscore); everything else acts as a separator and is dropped.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

def preprocess(text):
    """Tokenize ``text``, drop stopwords, and lemmatize what remains.

    Tokens come from ``simple_tokenize``; any token found in the module-level
    ``stop_words`` set is discarded *before* lemmatization, and each surviving
    token is passed through ``lemmatizer.lemmatize``.

    Returns:
        list of lemmas, in the order the tokens appeared in ``text``.
    """
    return [
        lemmatizer.lemmatize(token)
        for token in simple_tokenize(text)
        if token not in stop_words
    ]

# Example corpus (Doc3 and Doc4 use the same words in a different order)
documents = [
    "Artificial intelligence is transforming industries across the globe.",
    "Artificial intelligence raises important ethical questions for society.",
    "Artificial intelligence is transforming many industries and changing how we work.",
    "Artificial intelligence is changing how we work and transforming many industries.",
    "Machine learning is a subset of artificial intelligence used in many systems."
]

# Run every document through the preprocessing pipeline
processed_docs = list(map(preprocess, documents))

# Vocabulary: every distinct lemma across the corpus, in sorted order
vocab = sorted({word for doc in processed_docs for word in doc})

# Compute Term Frequency (TF)
def compute_tf(doc, vocabulary=None):
    """Return the relative term frequency of every vocabulary word in ``doc``.

    Args:
        doc: List of tokens for a single document.
        vocabulary: Iterable of words to score. When omitted, the module-level
            ``vocab`` is used, so existing ``compute_tf(doc)`` calls behave as
            before; passing it explicitly removes the hidden dependency on
            notebook state.

    Returns:
        dict mapping each vocabulary word (in vocabulary order) to
        ``count_in_doc / len(doc)``. Every value is ``0.0`` for an empty
        document, which avoids a division by zero.
    """
    if vocabulary is None:
        vocabulary = vocab
    word_count = len(doc)
    if word_count == 0:
        return {word: 0.0 for word in vocabulary}
    counts = Counter(doc)
    # Counter returns 0 for words absent from the document.
    return {word: counts[word] / word_count for word in vocabulary}

# Compute Inverse Document Frequency (IDF)
def compute_idf(docs, vocabulary=None):
    """Return the smoothed inverse document frequency of every vocabulary word.

    The value for a word is ``log((N + 1) / (df + 1)) + 1`` where ``N`` is the
    number of documents and ``df`` the number of documents containing the word.

    Args:
        docs: Sequence of documents, each an iterable of tokens.
        vocabulary: Iterable of words to score. When omitted, the module-level
            ``vocab`` is used, so existing ``compute_idf(docs)`` calls behave
            as before.

    Returns:
        dict mapping each vocabulary word (in vocabulary order) to its IDF.
        A word that occurs in no document gets ``log(N + 1) + 1``.
    """
    if vocabulary is None:
        vocabulary = vocab
    total_docs = len(docs)

    # Count each word at most once per document. Doing this in a single pass
    # avoids rescanning every token list for every vocabulary word.
    doc_freq = Counter()
    for doc in docs:
        doc_freq.update(set(doc))

    return {
        word: math.log((total_docs + 1) / (doc_freq[word] + 1)) + 1  # smoothed IDF
        for word in vocabulary
    }

# Per-document TF tables and the corpus-wide IDF table
tf_list = list(map(compute_tf, processed_docs))
idf = compute_idf(processed_docs)

# TF-IDF weight = TF * IDF for every vocabulary word; one dict per document
tfidf_docs = [
    {word: tf_doc[word] * idf[word] for word in vocab}
    for tf_doc in tf_list
]

# Labels "Doc1", "Doc2", ... reused for every table below
doc_names = [f'Doc{i+1}' for i in range(len(documents))]

# Words as rows, documents as columns
df = pd.DataFrame(tfidf_docs, index=doc_names).T

print("🔍 TF-IDF Matrix (Lemmatized, simple tokenizer):")
print(df.round(3))

# Dense array with one row per document and one column per vocabulary word
tfidf_matrix = np.array([[weights[word] for word in vocab] for weights in tfidf_docs])

# Pairwise cosine similarity between the document rows
similarity_matrix = cosine_similarity(tfidf_matrix)

# Label both axes of the square similarity table with the document names
sim_df = pd.DataFrame(similarity_matrix, index=doc_names, columns=doc_names)

print("\n🔍 Document Similarity (Cosine Similarity based on TF-IDF):")
print(sim_df.round(3))



🔍 TF-IDF Matrix (Lemmatized, simple tokenizer):
               Doc1   Doc2   Doc3   Doc4   Doc5
across        0.350  0.000  0.000  0.000  0.000
artificial    0.167  0.143  0.143  0.143  0.125
changing      0.000  0.000  0.242  0.242  0.000
ethical       0.000  0.300  0.000  0.000  0.000
globe         0.350  0.000  0.000  0.000  0.000
important     0.000  0.300  0.000  0.000  0.000
industry      0.234  0.000  0.201  0.201  0.000
intelligence  0.167  0.143  0.143  0.143  0.125
learning      0.000  0.000  0.000  0.000  0.262
machine       0.000  0.000  0.000  0.000  0.262
many          0.000  0.000  0.201  0.201  0.176
question      0.000  0.300  0.000  0.000  0.000
raise         0.000  0.300  0.000  0.000  0.000
society       0.000  0.300  0.000  0.000  0.000
subset        0.000  0.000  0.000  0.000  0.262
system        0.000  0.000  0.000  0.000  0.262
transforming  0.234  0.000  0.201  0.201  0.000
used          0.000  0.000  0.000  0.000  0.262
work          0.000  0.000  0.242  0.242

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ehmed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ehmed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Naive Bayes

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Four-message toy training set; labels[i] is the class of texts[i]
texts = ["free money now", "hello how are you", "win big cash", "meeting tomorrow"]
labels = ["spam", "ham", "spam", "ham"]

# Bag-of-words counts are the classifier's features
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)

# fit() returns the estimator itself, so construction and training are chained
model = MultinomialNB().fit(X, labels)

# Classify an unseen message using the vocabulary learned from the training texts
new_text = ["free cash offer"]
new_X = vectorizer.transform(new_text)
print(model.predict(new_X))


['spam']


In [25]:
import re

def custom_tokenize(text):
    """Split ``text`` into number, word, and symbol tokens.

    Alternatives are tried left to right at each position, so the more
    specific number forms win over the generic ones:

    * ``\\d+,\\d+`` / ``\\d+\\.\\d+`` - decimals written with a comma or a dot
      (e.g. ``2,5`` or ``2.9``)
    * ``\\d+``       - integers
    * ``\\w+``       - words (letters, digits, underscores)
    * ``[%$€]``     - percent and currency symbols
    * ``[=+\\-*/()]`` - math operators and parentheses
    * ``[^\\w\\s]``   - any other single non-space symbol (``:``, ``;``, quotes, ...)

    Whitespace is never emitted as a token.

    Returns:
        list of token strings in the order they appear in ``text``.
    """
    pattern = r'\d+,\d+|\d+\.\d+|\d+|\w+|[%$€]|[=+\-*/()]|[^\w\s]'
    token_re = re.compile(pattern)
    return [match.group(0) for match in token_re.finditer(text)]

# Numbered sample sentences (Azerbaijani) combined into one multi-line string.
# They exercise the tokenizer on decimals with comma and dot, percentages,
# a currency symbol, a clock time, units, quotes, and an algebraic expression.
text = """
1. AŞPA həqiqətləri görmür: Ermənistanda mediaya hücumlara göz yumulur - ŞƏRH + FOTO
2. A.Məlikli S.Bağırova deyib ki, “Biz bu məlumatlara 154.5$ pul xərcləmişik.
3. “İnformasiyanı yaymaq hüququnun pozulması ilə bağlı 2023-cü ildə 135 fakt qeydə alınıb. 
4. Ədədlər nəzəriyyəsinin inkişafı ədədlər, yəni natural (N), tam (Z) və rasional (Q) ədədlər üzərindəki əməllərlə başladı.
5. Xəbərlərin 20:00 buraxılışı
6. Böyük Səlcuq imperatorluğu faktiki olaraq ayrı – ayrı dövlətlərə parçalandı:
7. a=5,1; b=2.9 olduqda a2-4a+ab-4b ifadəsinin qiymətini tapın.
8. Kater çay axını ilə 16 km/saat, axına qarşı isə 10km/saat sürətlə gedir.
9. Ədədi 2,5 dəfə artırsaq, ədəd neçə faiz artar?
10. Əgər malın qiyməti: a) 40% ucuzlaşarsa; b)50.5 % bahalaşarsa; malın indiki qiyməti neçə faiz olar?
"""

# Tokenize the whole string at once and print the flat token list.
tokens = custom_tokenize(text)
print(tokens)


['1', '.', 'AŞPA', 'həqiqətləri', 'görmür', ':', 'Ermənistanda', 'mediaya', 'hücumlara', 'göz', 'yumulur', '-', 'ŞƏRH', '+', 'FOTO', '2', '.', 'A', '.', 'Məlikli', 'S', '.', 'Bağırova', 'deyib', 'ki', ',', '“', 'Biz', 'bu', 'məlumatlara', '154.5', '$', 'pul', 'xərcləmişik', '.', '3', '.', '“', 'İnformasiyanı', 'yaymaq', 'hüququnun', 'pozulması', 'ilə', 'bağlı', '2023', '-', 'cü', 'ildə', '135', 'fakt', 'qeydə', 'alınıb', '.', '4', '.', 'Ədədlər', 'nəzəriyyəsinin', 'inkişafı', 'ədədlər', ',', 'yəni', 'natural', '(', 'N', ')', ',', 'tam', '(', 'Z', ')', 'və', 'rasional', '(', 'Q', ')', 'ədədlər', 'üzərindəki', 'əməllərlə', 'başladı', '.', '5', '.', 'Xəbərlərin', '20', ':', '00', 'buraxılışı', '6', '.', 'Böyük', 'Səlcuq', 'imperatorluğu', 'faktiki', 'olaraq', 'ayrı', '–', 'ayrı', 'dövlətlərə', 'parçalandı', ':', '7', '.', 'a', '=', '5,1', ';', 'b', '=', '2.9', 'olduqda', 'a2', '-', '4', 'a', '+', 'ab', '-', '4', 'b', 'ifadəsinin', 'qiymətini', 'tapın', '.', '8', '.', 'Kater', 'çay', 'axın