In [1]:
# Imports useful packages
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Function to load the raw data
def extract_data(path='IMDB Dataset.csv', text_column='text', label_column='label'):
    """
    :description: loads the review corpus from a CSV file.
    :parameters: path - CSV file to read (defaults to 'IMDB Dataset.csv' in the
                 current working directory); text_column / label_column - names
                 of the columns holding the review text and the gold label.
    :return: a tuple (texts, labels) of pandas Series taken from the two columns.
    :raises: KeyError if either column is missing from the file.
    """
    data = pd.read_csv(path)

    # Report every missing column together with what the file actually offers,
    # instead of a bare KeyError naming only the first failing lookup.
    missing = [col for col in (text_column, label_column) if col not in data.columns]
    if missing:
        raise KeyError(
            "Column(s) %s not found in %r; available columns: %s"
            % (missing, path, list(data.columns))
        )

    return data[text_column], data[label_column]  # Returns reviews and labels

# Function for feature extraction
def get_tfidf_vectors(sentence):
    """
    :description: gets the TF-IDF values for the words in each sentence.
    :parameters: sentences as strings saved in a list (or another iterable of
                 strings that TfidfVectorizer.fit_transform accepts).
    :return: a list holding, for each sentence, a list of its TF-IDF values
             (will be used as features).
    """
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(sentence)
    print("n_samples: %d, n_features: %d" % vectors.shape)

    # Row i of the matrix already is the feature vector of sentence i, so it can
    # be converted directly; no DataFrame, transpose or per-column Python loop
    # is needed to recover the rows.
    # NOTE(review): the result is still fully dense (n_samples x n_features
    # floats), which can be very large for a big corpus. Callers that do not
    # need len() on the result could keep the sparse matrix instead.
    return vectors.toarray().tolist()

# Function for training the SVM model
def svm_classifier(train_features, train_labels, test_features):
    """
    :description: fits a sigmoid-kernel SVM on the training data and uses it to
                  label the test data.
    :parameters: training feature vectors, their gold labels, and the test
                 feature vectors.
    :return: the labels predicted for the test feature vectors.
    """
    # fit() hands back the fitted estimator, so build and train in one expression
    model = SVC(kernel='sigmoid').fit(train_features, train_labels)
    return model.predict(test_features)

# --- Step 1: Data Preprocessing ---
sentence, labels = extract_data()  # Extract sentence and gold labels

# --- Step 2: Feature Extraction ---
# NOTE(review): TF-IDF is fitted on the whole corpus before the split below, so
# test documents contribute to the vocabulary and IDF weights. For an unbiased
# estimate, fit the vectorizer on the training texts only and transform the
# test texts with it.
sentence_features = get_tfidf_vectors(sentence)  # Get TF-IDF features

# --- Step 3: Split data into training and test sets ---
# A fixed random_state makes the split (and therefore every result below)
# reproducible under Restart Kernel -> Run All.
train_features, test_features, train_labels, test_labels = train_test_split(
    sentence_features, labels, test_size=0.2, stratify=labels, random_state=42
)

# Check if number of features and labels match; stop here rather than training
# on misaligned data if they do not.
assert len(train_features) == len(train_labels), (
    "Feature/label count mismatch: %d features vs %d labels"
    % (len(train_features), len(train_labels))
)
print("\nNumber of Features and Labels Match.")

# --- Steps 4 and 5: Training and Predictions ---
pred = svm_classifier(train_features, train_labels, test_features)  # Train and predict
print("\nModel Predictions:", pred)  # Output predictions

# --- Step 6: Evaluation ---
# Compare predictions with the held-out gold labels (test_labels was otherwise unused).
accuracy = (pred == pd.Series(test_labels).to_numpy()).mean()
print("\nTest Accuracy: %.4f" % accuracy)


  from pandas.core import (


ModuleNotFoundError: No module named 'sklearn.svm'

In [2]:
import pandas as pd

In [8]:
# Install scikit-learn via the %pip magic. As pip's note in the cell output
# says, the kernel may need a restart before the package can be imported.
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
from scipy.spatial.distance import cdist

In [4]:
# Each `!` line runs in its own short-lived subshell, so creating a virtual
# environment and "activating" it with `source` on separate `!` lines never
# changes the environment of the running kernel (or even of the next `!` line).
# The %pip magic installs into the environment the kernel itself is using.
# Restart the kernel afterwards so the upgraded package is picked up.
%pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: threadpoolctl
    Found existing installation: threadpoolctl 2.2.0
    Uninstalling threadpoolctl-2.2.0:
      Successfully uninstalled threadpoolctl-2.2.0
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.4.2
    Uninstalling scikit-learn-1.4.2:
      Successfully uninstalled scikit-learn-1.4.2
Successfully installed scikit-learn-1.5.2 threadpoolctl-3.5.0


In [6]:
import sys

# Inspect the environment of the running kernel: a bare `!python` resolves
# through the shell's PATH and may be a different interpreter than the kernel's.
%pip show scikit-learn
%pip freeze
# Run the version report with the exact interpreter the kernel is using.
!"{sys.executable}" -c "import sklearn; sklearn.show_versions()"

Name: scikit-learn
Version: 1.5.2
Summary: A set of python modules for machine learning and data mining
Home-page: https://scikit-learn.org
Author: 
Author-email: 
License: BSD 3-Clause License

Copyright (c) 2007-2024 The scikit-learn developers.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS 