In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [11]:
data = pd.read_csv('Data/cleaned_dataset.csv')

Logistic Regression:

Use Case: It's used for binary classification tasks. In my case, predicting 'Is Halal' as 1 (true) or 0 (false) is a classic example of binary classification.
Output: Logistic Regression models the probability of the default class (e.g., 'Is Halal' = 1). It outputs values between 0 and 1, which are the probabilities that a given input point belongs to a particular class.
Sigmoid Function: It uses a logistic function (sigmoid function) to squeeze the output of a linear equation between 0 and 1.

Linear Regression:
Use Case: It's used for regression tasks where the target variable is continuous. For instance, predicting the price of a house or the temperature for a day.
Output: Linear Regression predicts a value along a continuous range.


Since your goal is to classify an item as either Halal or not, which is a categorical outcome, Logistic Regression is the correct model to use. Linear Regression would be inappropriate as it predicts continuous values, not categories.

In [12]:
data.head()

Unnamed: 0,ID,Title,Description,Practicing Level,Permissible For,Is Halal,Tags
0,___1d7BUUHY,reel instagram sab viral video short short vid...,,0,0,0,
1,___1xVwaG14,इंडियन आर्मी के जवानों को शराब क्यों पिलाया जा...,video given answer government give alcohol sol...,0,0,0,alcohol soldier drink alcohol indian army liqu...
2,___1y_4dOq8,sit prank epic reaction public prank onpublicprnk,sit prank epic reaction public prank dontsithe...,0,0,0,ball seat prank best prank video comedy prank ...
3,___3Si8H-iU,ai ki madad se youtubeshorts short ai,hidayat youtubeshorts ai short fire,0,0,1,ai fire hidayat short trending youtube short
4,___3tLmXbF4,زياد قرر يشتغل نادل بمطعم حرام عالزباين,مقالب شكل مجموعة مقالب بالفنانين برامج الكامير...,0,0,0,ziad sahtout زياد سحتوت فنانين كاميرا خفية مقا...


## Exploratory Data Analysis

In [13]:
# Handling Missing Values and Text Preprocessing
# Lowercasing all textual columns and calculating missing values in one step
text_columns = ['Title', 'Description', 'Tags']
data[text_columns] = data[text_columns].apply(lambda x: x.str.lower())
missing_values = data[text_columns].isnull().sum()

# Check the distribution of the 'Is Halal' column
halal_distribution = data['Is Halal'].value_counts(normalize=True)

print("Missing Values:\n", missing_values, "\n")
print("Halal Distribution:\n", halal_distribution)

Missing Values:
 Title            148
Description    12471
Tags           22619
dtype: int64 

Halal Distribution:
 Is Halal
0    0.672838
1    0.327162
Name: proportion, dtype: float64


In [14]:
# Handling Missing Values
data['Title'].fillna('no title', inplace=True)

# EDA: Analyzing distributions and negative values more efficiently
columns_to_analyze = ['Practicing Level', 'Permissible For']
for column in columns_to_analyze:
    distribution = data[column].value_counts(normalize=True)
    print(f"{column} Distribution:\n", distribution)
    if column == 'Permissible For':
        negative_values = data.loc[data[column] < 0, column].value_counts()
        print(f"Negative Values in '{column}':\n", negative_values)

Practicing Level Distribution:
 Practicing Level
0    0.836338
1    0.150118
3    0.009441
2    0.004103
Name: proportion, dtype: float64
Permissible For Distribution:
 Permissible For
 0    0.836338
 1    0.144485
 3    0.014721
 4    0.004426
-4    0.000029
Name: proportion, dtype: float64
Negative Values in 'Permissible For':
 Permissible For
-4    2
Name: count, dtype: int64


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Combining feature engineering and data preparation steps for efficiency
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Limiting to 1000 features for simplicity
X = tfidf_vectorizer.fit_transform(data['Title'])  # Vectorizing 'Title' and assigning to X
y = data['Is Halal']  # Assigning target variable to y

# Splitting the Data into Training and Testing Sets in a single step
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Displaying the shape of the training and testing sets directly
print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Testing set shape: {X_test.shape}, {y_test.shape}")

Training set shape: (54400, 1000), (54400,)
Testing set shape: (13600, 1000), (13600,)


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Model Building: Train Logistic Regression Model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# Predicting on the Test Set
y_pred = log_reg.predict(X_test)

# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

accuracy, precision, recall, f1

(0.7274264705882353,
 0.6607452339688041,
 0.34277365700157336,
 0.45138375018499327)

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Combining 'Title' and 'Description' into a single text feature for vectorization
data['Combined_Text'] = data['Title'] + " " + data['Description'].fillna('')  # Fill NaN descriptions with empty strings

# Reducing the feature complexity and sampling the data
tfidf_vectorizer_reduced = TfidfVectorizer(max_features=500)  # Reducing to 500 features
combined_text_tfidf_reduced = tfidf_vectorizer_reduced.fit_transform(data['Combined_Text'])

# Sampling a subset of the data (e.g., 10% of the data)
data_sampled = data.sample(frac=0.1, random_state=42)
combined_text_tfidf_sampled = tfidf_vectorizer_reduced.transform(data_sampled['Combined_Text'])

X_sampled = combined_text_tfidf_sampled
y_sampled = data_sampled['Is Halal']

# Splitting the Sampled Data into Training and Testing Sets
X_train_sampled, X_test_sampled, y_train_sampled, y_test_sampled = train_test_split(
    X_sampled, y_sampled, test_size=0.2, random_state=42, stratify=y_sampled
)

# Training the Random Forest model on the sampled data
rf_model_sampled = RandomForestClassifier(random_state=42)
rf_model_sampled.fit(X_train_sampled, y_train_sampled)

# Predicting on the Test Set of the sampled data
y_pred_sampled = rf_model_sampled.predict(X_test_sampled)

# Evaluating the new model on the sampled data
accuracy_sampled = accuracy_score(y_test_sampled, y_pred_sampled)
precision_sampled = precision_score(y_test_sampled, y_pred_sampled)
recall_sampled = recall_score(y_test_sampled, y_pred_sampled)
f1_sampled = f1_score(y_test_sampled, y_pred_sampled)

accuracy_sampled, precision_sampled, recall_sampled, f1_sampled

(0.7014705882352941, 0.5714285714285714, 0.350561797752809, 0.4345403899721448)

In [18]:
from sklearn.model_selection import cross_val_score

# Simplified hyperparameter tuning
# Adjusting a smaller set of key parameters manually
rf_model_simplified = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# Performing cross-validation with the simplified model
cv_scores = cross_val_score(rf_model_simplified, X_train_sampled, y_train_sampled, cv=5)

# Average cross-validation score
cv_average_score = cv_scores.mean()
cv_scores, cv_average_score
print("Cross-Validation Scores:", cv_scores)
print("Average Cross-Validation Score:", cv_average_score)

Cross-Validation Scores: [0.69485294 0.69761029 0.70496324 0.703125   0.70220588]
Average Cross-Validation Score: 0.7005514705882353


In [19]:
# Further reducing the size of the data sample and the number of features
data_more_sampled = data.sample(frac=0.05, random_state=42)  # Reducing to 5% of the data
tfidf_vectorizer_more_reduced = TfidfVectorizer(max_features=300)  # Reducing to 300 features
combined_text_tfidf_more_reduced = tfidf_vectorizer_more_reduced.fit_transform(data_more_sampled['Combined_Text'])

X_more_sampled = combined_text_tfidf_more_reduced
y_more_sampled = data_more_sampled['Is Halal']

# Splitting the more sampled data into training and testing sets
X_train_more_sampled, X_test_more_sampled, y_train_more_sampled, y_test_more_sampled = train_test_split(
    X_more_sampled, y_more_sampled, test_size=0.2, random_state=42, stratify=y_more_sampled
)

# Training Logistic Regression model on the more sampled data
log_reg_more_sampled = LogisticRegression(max_iter=1000, random_state=42)
log_reg_more_sampled.fit(X_train_more_sampled, y_train_more_sampled)

# Predicting on the Test Set of the more sampled data
y_pred_more_sampled = log_reg_more_sampled.predict(X_test_more_sampled)

# Evaluating the Logistic Regression model on the more sampled data
accuracy_more_sampled = accuracy_score(y_test_more_sampled, y_pred_more_sampled)
precision_more_sampled = precision_score(y_test_more_sampled, y_pred_more_sampled)
recall_more_sampled = recall_score(y_test_more_sampled, y_pred_more_sampled)
f1_more_sampled = f1_score(y_test_more_sampled, y_pred_more_sampled)

print("Accuracy:", accuracy_more_sampled)
print("Precision:", precision_more_sampled)
print("Recall:", recall_more_sampled)
print("F1 Score:", f1_more_sampled)

Accuracy: 0.6955882352941176
Precision: 0.5714285714285714
Recall: 0.2702702702702703
F1 Score: 0.3669724770642202


## Enhanced Text Preproccsing

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
import nltk
import re

# Download stopwords from NLTK
nltk.download('stopwords')
from nltk.corpus import stopwords

# Enhanced preprocessing function
def preprocess_text(text):
    stemmer = PorterStemmer()
    words = re.sub(r"[^a-zA-Z]", " ", text).lower().split()
    words = [stemmer.stem(word) for word in words if word not in set(stopwords.words("english"))]
    return " ".join(words)

# Apply preprocessing to combined text
data['Processed_Combined_Text'] = data['Combined_Text'].apply(preprocess_text)

# Vectorization with TF-IDF on processed text
tfidf_vectorizer = TfidfVectorizer(max_features=500)  # Adjust max_features as needed
X = tfidf_vectorizer.fit_transform(data['Processed_Combined_Text'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shakil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize the model
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit GridSearchCV
# Ensure X_train and y_train are defined and are the correct format
if 'X_train' not in locals() or 'y_train' not in locals():
    raise ValueError("X_train and y_train need to be defined before fitting the model.")

grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Best Score: 0.7248529325429064


### Video Recommendation System

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd  # Ensure pandas is imported

# Assuming video_indices is a dictionary mapping video IDs to their index in 'X'
video_indices = {vid: idx for idx, vid in enumerate(data['ID'])}

def recommend_videos(video_id, X, video_indices, data, num_recommendations=5):
    if video_id not in video_indices:
        return "Video ID not found in the dataset."

    idx = video_indices[video_id]
    is_halal = data.loc[data['ID'] == video_id, 'Is Halal'].iloc[0]
    filtered_data = data[data['Is Halal'] == is_halal]
    filtered_indices = pd.Series(filtered_data.index, index=filtered_data['ID']).drop_duplicates()
    filtered_data = filtered_data[filtered_data['ID'] != video_id]
    filtered_X = X[filtered_indices.values]

    cos_sim = cosine_similarity(X[idx:idx+1], filtered_X).flatten()
    sorted_indices = np.argsort(cos_sim)[::-1]  # Sorted in descending order

    recommendations = []
    for index in sorted_indices[:num_recommendations]:
        recommended_video = filtered_data.iloc[index]
        # Check if Tags is not NaN
        if not pd.isna(recommended_video['Tags']):
            recommendations.append(recommended_video[['ID', 'Title', 'Description', 'Tags']])

    return recommendations

# Example usage
video_id = '___1d7BUUHY'
recommendations = recommend_videos(video_id, X, video_indices, data)
for rec in recommendations:
    print(rec)
    print("\n")

ID                                                   ___1xVwaG14
Title          इंडियन आर्मी के जवानों को शराब क्यों पिलाया जा...
Description    video given answer government give alcohol sol...
Tags           alcohol soldier drink alcohol indian army liqu...
Name: 1, dtype: object


ID                          _CWNbBa7QC8
Title                        ciao italy
Description    join channel access perk
Tags                kamiladawn miladawn
Name: 58966, dtype: object


ID                                                   _33ro4bBOSk
Title          help help short drawing cartoon story xiaolind...
Description    tiktok instagram business email linoypeng gmai...
Tags           kunst animation art artist cartoon draw drawin...
Name: 16919, dtype: object


ID                                                   _4a5Kc_Uys4
Title          premer jadukor momtaz ashraf প্রেমের জাদুকর mu...
Description    plz subscribe song premer jadukor প্রেমের জাদু...
Tags           ashraf udash romantic song 

In [23]:
def recommend_halal_videos(video_id, X, video_indices, data, num_recommendations=5):
    # Check if the video ID is in the dataset
    if video_id not in video_indices:
        return "Video ID not found in the dataset."

    # Get the index of the video in the TF-IDF matrix
    idx = video_indices[video_id]

    # Filter the dataset for Halal videos
    filtered_data = data[data['Is Halal'] == 1]

    # Update indices for the filtered data
    filtered_indices = pd.Series(filtered_data.index, index=filtered_data['ID']).drop_duplicates()

    # Ensure the video itself is not recommended
    filtered_data = filtered_data[filtered_data['ID'] != video_id]

    # Update the TF-IDF matrix for the filtered data
    filtered_X = X[filtered_indices.values]

    # Calculate cosine similarity once
    cos_sim = cosine_similarity(X[idx:idx+1], filtered_X).flatten()

    # Get the indices of the videos sorted by similarity in descending order
    sorted_indices = np.argsort(cos_sim)[::-1]

    recommendations = []
    count = 0

    # Fetch recommendations until the desired number is reached
    for index in sorted_indices:
        if count >= num_recommendations:
            break

        recommended_video = filtered_data.iloc[index]

        # Check if the description is not nan
        if not pd.isna(recommended_video['Description']):
            recommendations.append({
                "Video ID": recommended_video.name,  # Adding video_id to the recommendation
                "Title": recommended_video['Title'],
                "Description": recommended_video['Description'],
                "Tags": recommended_video['Tags']
            })
            count += 1

    return recommendations

# Example usage
video_id = '___1d7BUUHY'  # Replace with actual video ID
recommendations = recommend_halal_videos(video_id, X, video_indices, data)
for rec in recommendations:
    print(f"Video ID: {rec['Video ID']}")  # Displaying video_id
    print(f"Title: {rec['Title']}")
    print(f"Description: {rec['Description']}")
    print(f"Tags: {rec['Tags']}")
    print()

Video ID: 19796
Title: छोटे भाई के लिए बहन ने बनाया खिलौना short viral video trending shortsvideo viralvideo
Description: छोटे भाई के लिए बहन ने बनाया खिलौना short viral video trending shortsvideo viralvideo
Tags: nan

Video ID: 4069
Title: प्रधानमंत्री आदर्श ग्राम योजना क्या होती है
Description: प्रधानमंत्री आदर्श ग्राम योजना क्या होती है pradhan mantri adarsh gram yojna pmagy pmagy jmdajit reel trending foryou trendingvideos viral
Tags: nan

Video ID: 46410
Title: हे पदार्थ मूळव्याध मुळापासून बरा करतात mulvyadh short reel swagattodkar short ाठी pile
Description: हे पदार्थ मूळव्याध मुळापासून बरा करतात mulvyadh short reel swagattodkar short ाठी pile
Tags: nan

Video ID: 61642
Title: makkah makkah reel short viralvideo trendingshorts short
Description: makkah makkah reel short viralvideo trendingshorts short
Tags: nan

Video ID: 44657
Title: damainya suasana pedesaan wonosobo short
Description: bagi mereka yang berada di pedesaan terkadang pemandangan seperti ini adalah hal yang biasa t

## Recommand Halal Video even if the recieved video is Hallal

In [24]:
# from numpy import indices  # This import is causing the issue, it should be removed or renamed

def recommend_halal_videos(video_id, X, video_indices, data, num_recommendations=5):  # Renamed 'indices' parameter to 'video_indices' to avoid conflict
    if video_id not in video_indices:  # Changed 'indices' to 'video_indices'
        return "Video ID not found in the dataset."

    idx = video_indices[video_id]  # Changed 'indices' to 'video_indices'
    filtered_data = data[data['Is Halal'] == 1]
    filtered_indices = pd.Series(filtered_data.index, index=filtered_data['ID']).drop_duplicates()
    filtered_data = filtered_data[filtered_data['ID'] != video_id]
    filtered_X = X[filtered_indices.values]

    # Calculate cosine similarity once
    cos_sim = cosine_similarity(X[idx:idx+1], filtered_X).flatten()
    sorted_indices = np.argsort(cos_sim)[::-1]  # Sorted in descending order

    recommendations = []
    for index in sorted_indices:
        if len(recommendations) >= num_recommendations:
            break

        recommended_video = filtered_data.iloc[index]
        # Check if both description and tags are not NaN
        if not pd.isna(recommended_video['Description']) and not pd.isna(recommended_video['Tags']):
            recommendations.append({
                "Video ID": recommended_video.name,  # Adding video_id to the recommendation
                "Title": recommended_video['Title'],
                "Description": recommended_video['Description'],
                "Tags": recommended_video['Tags']
            })

    return recommendations

# Example usage
video_id = '___1d7BUUHY'  # Replace with actual video ID
recommendations = recommend_halal_videos(video_id, X, video_indices, data)  # Changed 'indices' to 'video_indices'
for rec in recommendations:
    print(f"Video ID: {rec['Video ID']}")  # Displaying video_id
    print(f"Title: {rec['Title']}")
    print(f"Description: {rec['Description']}")
    print(f"Tags: {rec['Tags']}")
    print("\n\n")  # Two lines of space

Video ID: 13641
Title: darul uloom waqf deoband darul uloom deoband deoband trending viral trend short short video
Description: darul uloom deoband waqf darul uloom deoband deoband darul uloom deoband waqf new video trending viral trend viralvideo video deoband shortvideo alazkardeoband hamdtvdeoband ulamaedeoband voiceofdeoband
Tags: darul uloom deoband darul uloom deoband waqf darul uloom waqf darul uloom waqf deoband deoband deoband ulamai deoband waqf darul uloom



Video ID: 38541
Title: vfx short video
Description: vfx short video
Tags: mobile edits photo director vfx vineeth creation



Video ID: 35882
Title: প্রিয় শায়খ আল্লাহ আপনার সহায় হোন।। short video।।
Description: প্রিয় শায়খ আল্লাহ আপনার সহায় হোন।। short video।।
Tags: allama mamunul hoque short video



Video ID: 15334
Title: gulkand lassi short video gulkandlassi
Description: gulkand lassi short video gulkandlassi mukeshgulkandlassi lassishort
Tags: gulkand lassi gulkand lassi shot video gulkand lassi video lassi mukesh g

In [25]:
def verify_halal_recommendations(data, recommendations):
    # Filter data to only include IDs present in recommendations
    recommended_data = data[data['ID'].isin([rec['Video ID'] for rec in recommendations])]
    
    # Check if all recommended entries are Halal
    non_halal_in_recommendations = recommended_data[recommended_data['Is Halal'] == 0]
    if non_halal_in_recommendations.empty:
        print("All recommended videos are Halal.")
    else:
        # Highlighting the presence of Non-Halal videos in the recommendations
        print(f"Warning: Found {len(non_halal_in_recommendations)} Non-Halal video(s) in the recommendations. This may indicate a need for model reevaluation.")
        # Printing IDs of Non-Halal videos
        print("Non-Halal Video IDs:", non_halal_in_recommendations['ID'].tolist())
    
    # Check if all halal_data entries have Title, Description, and Tags
    halal_data_valid = recommended_data.dropna(subset=['Title', 'Description', 'Tags'])
    if len(halal_data_valid) == len(recommended_data):
        print("All recommended Halal data entries have Title, Description, and Tags.")
    else:
        print("Some recommended Halal data entries are missing Title, Description, or Tags.")
    
    return halal_data_valid, non_halal_in_recommendations

# Example usage
recommendations = recommend_halal_videos(video_id, X, video_indices, data)  # Assuming this function returns a list of recommendations
halal_data_valid, non_halal_in_recommendations = verify_halal_recommendations(data, recommendations)
print(f"Valid Halal Entries in Recommendations: {len(halal_data_valid)}")
print(f"Non-Halal Entries in Recommendations: {len(non_halal_in_recommendations)}")

All recommended videos are Halal.
All recommended Halal data entries have Title, Description, and Tags.
Valid Halal Entries in Recommendations: 0
Non-Halal Entries in Recommendations: 0


In [26]:
from random import random


def recommend_halal_videos():
    # Implement logic to randomly select five Halal videos
    halal_videos = data[data['Is Halal'] == 1]
    return random.sample(list(halal_videos['ID']), min(len(halal_videos), 5))


In [27]:
from sklearn.metrics import accuracy_score

# Predicting on the test set
y_pred = grid_search.predict(X_test)

# Calculating the accuracy
accuracy = accuracy_score(y_test, y_pred)

# Printing the accuracy
print("Model Accuracy:", accuracy)

Model Accuracy: 0.7295588235294118


In [28]:
# write a pickle file name check
import pickle

# Save the model to disk
filename = 'finalized_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(rf_model_sampled, file)

In [29]:

import pickle

# Save the TF-IDF vectorizer to a file
# Ensure that 'tfidf_vectorizer' is the variable name of your trained TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)