In [40]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_iris
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
import pandas as pd

In [7]:
flist = []
with open(r'./trec06p/trec06p/full/index', 'r') as f:
    flist = f.readlines()

In [13]:
flabel = [str.split()[0] for str in flist]
for i in range(len(flabel)):
    if flabel[i] == "spam":
        flabel[i] = 1
    else:
        flabel[i] = 0

In [23]:
print(flabel)

[0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 

In [35]:
ftxt = ["./trec06p/trec06p/data/" + str.split()[1].split('\n')[0] for str in flist]

for i in range(len(ftxt)):
    with open(ftxt[i], 'rb') as f:
        ftxt[i] = f.read()

In [37]:
data_size = len(ftxt)

In [38]:
data_size

37822

In [39]:
X_train, X_test, Y_train, Y_test = train_test_split(ftxt, flabel, test_size=0.2, random_state=42)

In [42]:
vectorizer = TfidfVectorizer(decode_error='ignore')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfdif = vectorizer.transform(X_test)

In [44]:
model = LogisticRegression()
model.fit(X_train_tfidf, Y_train)

In [45]:
y_pred = model.predict(X_test_tfdif)

In [47]:
accuracy = accuracy_score(Y_test, y_pred)

In [48]:
report = classification_report(Y_test, y_pred, target_names=['Ham', 'Spam'])

In [50]:
print(f"Accuracy: {accuracy:.5f}")
print("Classification Report:\n", report)

Accuracy: 0.99088
Classification Report:
               precision    recall  f1-score   support

         Ham       0.99      0.98      0.99      2534
        Spam       0.99      1.00      0.99      5031

    accuracy                           0.99      7565
   macro avg       0.99      0.99      0.99      7565
weighted avg       0.99      0.99      0.99      7565



In [51]:
model.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [52]:
import joblib

In [53]:
joblib.dump(model, 'spam_classifier_modelv0.joblib')

['spam_classifier_modelv0.joblib']

In [56]:
weights = model.coef_
intercept = model.intercept_

print("Model Weights (Coefficients):", weights, len(weights[0]))
print("Model Intercept:", intercept)

Model Weights (Coefficients): [[ 1.66140543e+00  3.29016081e+00 -4.39367956e-01 ... -3.01641753e-06
   4.33557855e-04  6.18531631e-02]] 1384719
Model Intercept: [0.54957606]


In [59]:
X_test_tfdif

<7565x1384719 sparse matrix of type '<class 'numpy.float64'>'
	with 1476664 stored elements in Compressed Sparse Row format>

In [61]:
import cv2

In [62]:
import numpy as np

In [63]:
import os

In [64]:
cat_dir = "./PetImages/Cat"
dog_dir = "./PetImages/Dog"

In [105]:
img_size = 256

In [92]:
def load_images_from_folder(folder, label):
    images = []
    labels = []
    for filename in os.listdir(folder):
        img_path = os.path.join(folder, filename)
        img = cv2.imread(img_path)
        if img is not None:
            img = cv2.resize(img, (img_size, img_size))
            img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            img = img.flatten()
            images.append(img)
            labels.append(label)
    return images, labels

In [106]:
cat_images, cat_labels = load_images_from_folder(cat_dir, label=0)
dog_images, dog_labels = load_images_from_folder(dog_dir, label=1)

X = np.array(cat_images + dog_images)
y = np.array(cat_labels + dog_labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Corrupt JPEG data: 128 extraneous bytes before marker 0xd9
Corrupt JPEG data: 99 extraneous bytes before marker 0xd9
Corrupt JPEG data: 239 extraneous bytes before marker 0xd9
Corrupt JPEG data: 214 extraneous bytes before marker 0xd9
Corrupt JPEG data: 1153 extraneous bytes before marker 0xd9
Corrupt JPEG data: 2230 extraneous bytes before marker 0xd9
Corrupt JPEG data: 162 extraneous bytes before marker 0xd9
Corrupt JPEG data: 65 extraneous bytes before marker 0xd9
Corrupt JPEG data: 399 extraneous bytes before marker 0xd9
Corrupt JPEG data: 1403 extraneous bytes before marker 0xd9
Corrupt JPEG data: 254 extraneous bytes before marker 0xd9
Corrupt JPEG data: 226 extraneous bytes before marker 0xd9


In [71]:
from sklearn.preprocessing import StandardScaler

In [107]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [108]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [110]:
y_pred = model.predict(X_train)

accuracy = accuracy_score(y_train, y_pred)
report = classification_report(y_train, y_pred, target_names=['Cat', 'Dog'])

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)

Accuracy: 0.78
Classification Report:
               precision    recall  f1-score   support

         Cat       0.78      0.79      0.78      9929
         Dog       0.79      0.78      0.78     10027

    accuracy                           0.78     19956
   macro avg       0.78      0.78      0.78     19956
weighted avg       0.78      0.78      0.78     19956



In [75]:
len(y_test)

4990

In [76]:
len(y_pred)

7565

In [89]:
weights = model.coef_
intercept = model.intercept_

print("Model Weights (Coefficients):", weights, len(weights[0]))
print("Model Intercept:", intercept)

Model Weights (Coefficients): [[ 0.02211209  0.02584454  0.0075863  ... -0.00390763  0.01295102
  -0.0075941 ]] 49152
Model Intercept: [0.06333582]


In [111]:
joblib.dump(model, 'dog_cat_modelv0.joblib')

['dog_cat_modelv0.joblib']

In [1]:
import joblib

In [2]:
model = joblib.load("./model.joblib")

In [3]:
weights = model.coef_

In [4]:
weights

array([[ 0.01821917,  0.01681465,  0.0265765 , ..., -0.01145733,
        -0.01045444,  0.00966401]])

In [5]:
list(weights)

[array([ 0.01821917,  0.01681465,  0.0265765 , ..., -0.01145733,
        -0.01045444,  0.00966401])]

In [18]:
w = model.coef_.tolist()[0]

In [14]:
offset = model.intercept_

In [16]:
offset.tolist()[0]

0.030177193164069173

In [19]:
len(w)

65536