In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
fake_path = '/content/drive/MyDrive/News dataset/NewsFake.csv'
true_path = '/content/drive/MyDrive/News dataset/NewsTrue.csv'
fake_df = pd.read_csv(fake_path)
true_df = pd.read_csv(true_path)
print("NewsFake Dataset:")
print(fake_df.head(3))
print("\nNewsTrue Dataset:")
print(true_df.head(3))

NewsFake Dataset:
                                               title  \
0   donald trump sends out embarrassing new year’...   
1   drunk bragging trump staffer started russian ...   
2   sheriff david clarke becomes an internet joke...   

                                                text subject  \
0  donald trump just couldn t wish all americans ...    news   
1  house intelligence committee chairman devin nu...    news   
2  on friday, it was revealed that former milwauk...    news   

                date                                       cleaned_text  \
0  december 31, 2017  donald trump just couldn t wish all americans ...   
1  december 31, 2017  house intelligence committee chairman devin nu...   
2  december 30, 2017  on friday it was revealed that former milwauke...   

                                   text_no_stopwords  \
0  donald trump wish americans happy new year lea...   
1  house intelligence committee chairman devin nu...   
2  friday revealed former milwa

In [None]:
total_rows = fake_df.shape[0] + true_df.shape[0]
print(f"Total rows across both datasets: {total_rows}")


Total rows across both datasets: 44898


**POS**

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng') # Download the required data package

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
!pip install spacy
import spacy

# Download the English language model if you haven't already
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m78.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
fake_df_sample = fake_df.sample(1000)  # Use 1,000 rows
true_df_sample = true_df.sample(1000)

fake_df_sample['pos_tags'] = spacy_pos_tagging_batch(fake_df_sample['text'])
true_df_sample['pos_tags'] = spacy_pos_tagging_batch(true_df_sample['text'])


In [None]:
print("Fake Dataset Sample with POS Tags:")
print(fake_df_sample[['text', 'pos_tags']].head())

print("\nTrue Dataset Sample with POS Tags:")
print(true_df_sample[['text', 'pos_tags']].head())


Fake Dataset Sample with POS Tags:
                                                    text  \
16905  every american should be demanding answers abo...   
6055   are we a post-racial nation? no, we re not, no...   
1096   ever since his presidential campaign, donald t...   
19143  brigitte gabriel is a christian who, after bei...   
13041  oh what a tangled web we weave, when first we ...   

                                                pos_tags  
16905  [DET, NOUN, AUX, AUX, VERB, NOUN, ADP, SCONJ, ...  
6055   [AUX, PRON, DET, ADJ, ADJ, ADJ, NOUN, PUNCT, I...  
1096   [ADV, SCONJ, PRON, ADJ, NOUN, PUNCT, PROPN, PR...  
19143  [PROPN, PROPN, AUX, DET, NOUN, PRON, PUNCT, AD...  
13041  [INTJ, PRON, DET, VERB, NOUN, PRON, VERB, PUNC...  

True Dataset Sample with POS Tags:
                                                    text  \
13265  paris (reuters) - france won t let the officia...   
5319   washington (reuters) - u.s. president donald t...   
15092  danang, vietnam (reuters) -

In [None]:
fake_df_sample.to_csv('/content/processed_fake_sample.csv', index=False)
true_df_sample.to_csv('/content/processed_true_sample.csv', index=False)


In [None]:
from collections import Counter

def pos_frequency(pos_tags):
    # Modified to handle potential variations in pos_tags structure
    pos_list = []
    for item in pos_tags:  # Iterate through elements of pos_tags
        if isinstance(item, tuple) and len(item) >= 2:  # Check if item is a tuple with at least 2 elements
            pos_list.append(item[1])  # Append the second element (POS tag)
        elif isinstance(item, str):  # Check if item is a string (possibly just the POS tag)
            pos_list.append(item)
    return Counter(pos_list)  # Return the frequency count

fake_sample_pos_freq = fake_df_sample['pos_tags'].apply(pos_frequency)
true_sample_pos_freq = true_df_sample['pos_tags'].apply(pos_frequency)

print("POS Frequency in Fake Dataset Sample:")
print(fake_sample_pos_freq.head())

print("\nPOS Frequency in True Dataset Sample:")
print(true_sample_pos_freq.head())

POS Frequency in Fake Dataset Sample:
16905    {'DET': 68, 'NOUN': 151, 'AUX': 51, 'VERB': 91...
6055     {'AUX': 34, 'PRON': 64, 'DET': 48, 'ADJ': 43, ...
1096     {'ADV': 23, 'SCONJ': 15, 'PRON': 46, 'ADJ': 22...
19143    {'PROPN': 85, 'AUX': 29, 'DET': 72, 'NOUN': 10...
13041    {'INTJ': 1, 'PRON': 39, 'DET': 78, 'VERB': 86,...
Name: pos_tags, dtype: object

POS Frequency in True Dataset Sample:
13265    {'PROPN': 40, 'PUNCT': 39, 'VERB': 52, 'DET': ...
5319     {'PROPN': 84, 'PUNCT': 117, 'VERB': 91, 'PRON'...
15092    {'PROPN': 18, 'PUNCT': 12, 'ADJ': 10, 'NOUN': ...
1947     {'PUNCT': 66, 'NOUN': 99, 'DET': 39, 'ADJ': 33...
14177    {'PROPN': 11, 'PUNCT': 8, 'ADJ': 5, 'VERB': 14...
Name: pos_tags, dtype: object


In [None]:
from sklearn.feature_extraction import DictVectorizer

# Convert the 'pos_tags' to dictionaries for vectorization
# Extract only the token and POS tag from each tuple
combined_sample['pos_features'] = combined_sample['pos_tags'].apply(lambda tags: {tag[0]: tag[1] for tag in tags if isinstance(tag, tuple) and len(tag) >= 2})

# Vectorize the POS frequency features
vectorizer = DictVectorizer(sparse=False)  # Use sparse=True for large datasets
X = vectorizer.fit_transform(combined_sample['pos_features'])
y = combined_sample['label']

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the size of each set
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")


Training set size: 1600 samples
Test set size: 400 samples


In [None]:
print(combined_sample['pos_tags'].head())



0    [DET, NOUN, AUX, AUX, VERB, NOUN, ADP, SCONJ, ...
1    [AUX, PRON, DET, ADJ, ADJ, ADJ, NOUN, PUNCT, I...
2    [ADV, SCONJ, PRON, ADJ, NOUN, PUNCT, PROPN, PR...
3    [PROPN, PROPN, AUX, DET, NOUN, PRON, PUNCT, AD...
4    [INTJ, PRON, DET, VERB, NOUN, PRON, VERB, PUNC...
Name: pos_tags, dtype: object


In [None]:
print(combined_sample['pos_tags'].tail())


1995    [PROPN, PUNCT, PROPN, PUNCT, PUNCT, PROPN, PRO...
1996    [PROPN, PUNCT, PROPN, PUNCT, PUNCT, PROPN, PRO...
1997    [PROPN, PUNCT, PROPN, PUNCT, PUNCT, DET, PROPN...
1998    [PROPN, PUNCT, PROPN, PUNCT, PUNCT, PROPN, NOU...
1999    [PROPN, PUNCT, PROPN, PUNCT, PUNCT, PROPN, PRO...
Name: pos_tags, dtype: object


**NER**

In [None]:
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Disable components not needed for NER
with nlp.disable_pipes('ner', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'):
    fake_df_sample['ner_entities'] = fake_df_sample['text'].apply(lambda text: spacy_ner(text))
    true_df_sample['ner_entities'] = true_df_sample['text'].apply(lambda text: spacy_ner(text))

# Print the results to verify
print("Fake Dataset Sample with NER Entities:")
print(fake_df_sample[['text', 'ner_entities']].head())

print("\nTrue Dataset Sample with NER Entities:")
print(true_df_sample[['text', 'ner_entities']].head())


Fake Dataset Sample with NER Entities:
                                                    text ner_entities
16905  every american should be demanding answers abo...           []
6055   are we a post-racial nation? no, we re not, no...           []
1096   ever since his presidential campaign, donald t...           []
19143  brigitte gabriel is a christian who, after bei...           []
13041  oh what a tangled web we weave, when first we ...           []

True Dataset Sample with NER Entities:
                                                    text ner_entities
13265  paris (reuters) - france won t let the officia...           []
5319   washington (reuters) - u.s. president donald t...           []
15092  danang, vietnam (reuters) - russian president ...           []
1947   (reuters) - a federal judge has rejected senat...           []
14177  dubai (reuters) - saudi foreign minister adel ...           []


In [None]:
# Count the occurrences of different entity types
fake_entities = fake_df_sample['ner_entities'].explode().value_counts()
true_entities = true_df_sample['ner_entities'].explode().value_counts()

print("Fake Dataset Entities Frequency:")
print(fake_entities)

print("\nTrue Dataset Entities Frequency:")
print(true_entities)

Fake Dataset Entities Frequency:
Series([], Name: count, dtype: int64)

True Dataset Entities Frequency:
Series([], Name: count, dtype: int64)


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
fake_entity_features = mlb.fit_transform(fake_df_sample['ner_entities'])
true_entity_features = mlb.transform(true_df_sample['ner_entities'])
X_fake = pd.DataFrame(index=fake_df_sample.index)
X_true = pd.DataFrame(index=true_df_sample.index)
X_fake = pd.concat([X_fake, pd.DataFrame(fake_entity_features, columns=mlb.classes_, index=fake_df_sample.index)], axis=1)
X_true = pd.concat([X_true, pd.DataFrame(true_entity_features, columns=mlb.classes_, index=true_df_sample.index)], axis=1)

In [None]:
existing_features_fake = fake_df_sample[['title', 'text']]
existing_features_true = true_df_sample[['title', 'text']]

X_fake = pd.concat([X_fake, existing_features_fake], axis=1)
X_true = pd.concat([X_true, existing_features_true], axis=1)

In [None]:
existing_features_fake = fake_df_sample[['title', 'text']]
existing_features_true = true_df_sample[['title', 'text']]


In [None]:
X_fake = pd.concat([X_fake, existing_features_fake], axis=1)
X_true = pd.concat([X_true, existing_features_true], axis=1)


In [None]:
y_fake = fake_df_sample['label']
y_true = true_df_sample['label']


In [None]:
X_combined = pd.concat([X_fake, X_true], axis=0)
y_combined = pd.concat([y_fake, y_true], axis=0)


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")


Training set size: 1600 samples
Test set size: 400 samples


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # You can adjust the number of features
X_fake_text = vectorizer.fit_transform(fake_df_sample['text'])
X_true_text = vectorizer.transform(true_df_sample['text'])
X_fake_text_df = pd.DataFrame(X_fake_text.toarray(), columns=vectorizer.get_feature_names_out(), index=fake_df_sample.index)
X_true_text_df = pd.DataFrame(X_true_text.toarray(), columns=vectorizer.get_feature_names_out(), index=true_df_sample.index)
X_fake = pd.concat([X_fake, X_fake_text_df], axis=1)
X_true = pd.concat([X_true, X_true_text_df], axis=1)
X_combined = pd.concat([X_fake, X_true], axis=0)
y_combined = pd.concat([y_fake, y_true], axis=0)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)
print(X_train.head())


                                                   title  \
16306  trump slams the globalists: “there is no globa...   
3530    nothing about those 50,000 jobs from japan is...   
1613    dan rather breaks the internet with searing r...   
11348  truth! sara carter is a gem of a reporter: “no...   
959     a panicked trump ran around the white house a...   

                                                    text  \
16306                                                      
3530   for the second time in as many weeks, trump s ...   
1613   with trump in the white house, former cbs even...   
11348                                                      
959    this incredibly awkward story comes from new y...   

                                                   title  \
16306  trump slams the globalists: “there is no globa...   
3530    nothing about those 50,000 jobs from japan is...   
1613    dan rather breaks the internet with searing r...   
11348  truth! sara carter is a gem of 

**TF**-**IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
all_text = pd.concat([fake_df_sample['text'], true_df_sample['text']])
vectorizer.fit(all_text)
X_fake_text = vectorizer.transform(fake_df_sample['text'])
X_true_text = vectorizer.transform(true_df_sample['text'])
X_fake_text_df = pd.DataFrame(X_fake_text.toarray(), columns=vectorizer.get_feature_names_out(), index=fake_df_sample.index)
X_true_text_df = pd.DataFrame(X_true_text.toarray(), columns=vectorizer.get_feature_names_out(), index=true_df_sample.index)
X_fake = X_fake.drop(columns=['text', 'title'], errors='ignore')
X_true = X_true.drop(columns=['text', 'title'], errors='ignore')
X_fake = pd.concat([X_fake, X_fake_text_df], axis=1)
X_true = pd.concat([X_true, X_true_text_df], axis=1)
X_combined = pd.concat([X_fake, X_true], axis=0)
y_combined = pd.concat([y_fake, y_true], axis=0)
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       199
           1       1.00      1.00      1.00       201

    accuracy                           0.99       400
   macro avg       0.99      0.99      0.99       400
weighted avg       0.99      0.99      0.99       400



In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)


{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X_combined, y_combined, cv=5)
print("Cross-validation scores:", scores)


Cross-validation scores: [0.9875 0.9925 1.     0.9925 0.9925]


In [None]:
import joblib
joblib.dump(clf, 'random_forest_model.pkl')


['random_forest_model.pkl']

In [None]:
import joblib

# Save the model and vectorizer
joblib.dump(clf, 'random_forest_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [None]:
import joblib
model = joblib.load('random_forest_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')
new_data = ["Example text to classify"]
feature_names = vectorizer.get_feature_names_out()
new_data_transformed = vectorizer.transform(new_data)
new_data_df = pd.DataFrame(new_data_transformed.toarray(), columns=feature_names)
model_features = model.feature_names_in_
missing_features = set(model_features) - set(new_data_df.columns)
for feature in missing_features:
    new_data_df[feature] = 0
new_data_df = new_data_df[model_features]
predictions = model.predict(new_data_df)
print("Predictions:", predictions)

  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
 

Predictions: [0]


  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0
  new_data_df[feature] = 0


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.995

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       199
           1       1.00      1.00      1.00       201

    accuracy                           0.99       400
   macro avg       0.99      0.99      0.99       400
weighted avg       0.99      0.99      0.99       400


Confusion Matrix:
[[198   1]
 [  1 200]]


NB **Classifier**

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [None]:
y_pred_nb = nb_classifier.predict(X_test)


In [None]:
accuracy_nb = nb_classifier.score(X_test, y_test)
print(f"Accuracy: {accuracy_nb}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_nb))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))


Accuracy: 0.9375

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.92      0.94       199
           1       0.93      0.95      0.94       201

    accuracy                           0.94       400
   macro avg       0.94      0.94      0.94       400
weighted avg       0.94      0.94      0.94       400


Confusion Matrix:
[[184  15]
 [ 10 191]]


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_pred_nb = nb_classifier.predict(X_test)
accuracy_nb = nb_classifier.score(X_test, y_test)
print(f"Accuracy: {accuracy_nb}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_nb))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))


Accuracy: 0.9375

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.92      0.94       199
           1       0.93      0.95      0.94       201

    accuracy                           0.94       400
   macro avg       0.94      0.94      0.94       400
weighted avg       0.94      0.94      0.94       400


Confusion Matrix:
[[184  15]
 [ 10 191]]


In [None]:
import joblib
joblib.dump(nb_classifier, 'naive_bayes_model.pkl')


['naive_bayes_model.pkl']

**SVM**

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)

In [None]:
y_pred_svm = svm_classifier.predict(X_test)


In [None]:
accuracy_svm = svm_classifier.score(X_test, y_test)
print(f"Accuracy: {accuracy_svm}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))


Accuracy: 0.9825

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       199
           1       0.98      0.99      0.98       201

    accuracy                           0.98       400
   macro avg       0.98      0.98      0.98       400
weighted avg       0.98      0.98      0.98       400


Confusion Matrix:
[[195   4]
 [  3 198]]


 Tuning **Hyperparameters**

In [None]:
param_grid = {
    'C': [0.1, 1],  # Reduce the range of C
    'kernel': ['linear'],  # Test with only one kernel
    'gamma': ['scale']  # Use only one gamma value
}


In [None]:

X_train_subset, _, y_train_subset, _ = train_test_split(X_train, y_train, test_size=0.9, random_state=42)
grid_search.fit(X_train_subset, y_train_subset)


In [None]:
!pip install scipy
from scipy.stats import uniform

param_dist = {
    'C': uniform(0.1, 2),
    'kernel': ['linear'],
    'gamma': ['scale']
}



In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

X = np.array([[1, 2], [3, 4], [5, 6]])
y = np.array([0, 1, 0])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from scipy.stats import uniform
import numpy as np
X = np.array([[1, 2], [3, 4], [5, 6]])
y = np.array([0, 1, 0])
random_search = RandomizedSearchCV(SVC(), param_distributions=param_dist, n_iter=5, cv=2, scoring='accuracy', random_state=42)
random_search.fit(X, y)

5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 199, in fit
    y = self._validate_targets(y)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 742, in _validate_targets
    raise ValueError(
ValueError: The number of classes has

In [None]:
param_dist = {
    'C': uniform(0.1, 10),  # Uniform distribution for 'C' between 0.1 and 10
    'kernel': ['linear', 'rbf'],  # Possible kernel types
    'gamma': ['scale', 'auto']  # Options for gamma
}


In [37]:
random_search.fit(X, y)


10 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 199, in fit
    y = self._validate_targets(y)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 742, in _validate_targets
    raise ValueError(
ValueError: The number of classes h

In [None]:
print("Best Parameters:", random_search.best_params_)


Best Parameters: {'C': 0.849080237694725, 'gamma': 'scale', 'kernel': 'linear'}


In [None]:
best_svm = random_search.best_estimator_
print("Best Model:", best_svm)


Best Model: SVC(C=0.849080237694725, kernel='linear')


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
y_pred = best_svm.predict(X)
print("Classification Report:")
print(classification_report(y, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       0.00      0.00      0.00         1

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3


Confusion Matrix:
[[2 0]
 [1 0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
