In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import pennylane as qml
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer
from nltk.stem import PorterStemmer
from pennylane import numpy as qnp
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads later in
# the notebook; the call reports when the package is already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dines\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the IMDB reviews (columns: 'review', 'sentiment').
# The path is a raw string: in a normal literal the "\U" of "C:\Users" starts an
# 8-digit unicode escape and the cell fails with a SyntaxError before running.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs elsewhere.
data = pd.read_csv(r'C:\Users\Asus\Documents\QML\project\IMDB Dataset.csv')

In [3]:
# Shared text-cleaning resources used by remove_stopwords().
tokenizer = ToktokTokenizer()
# Stored as a set so the per-token membership test is a hash lookup.
stopword_list = set(stopwords.words('english'))

In [4]:
import re


def remove_special_characters(text, remove_digits=True):
    """Delete every character of ``text`` that is not a letter or whitespace.

    Digits are deleted as well by default; pass ``remove_digits=False`` to
    keep them. Returns the cleaned string.
    """
    if remove_digits:
        unwanted = r'[^a-zA-Z\s]'
    else:
        unwanted = r'[^a-zA-Z0-9\s]'
    return re.sub(unwanted, '', text)

In [5]:
def remove_stopwords(text, is_lower_case=False):
    """Return ``text`` with English stop words dropped.

    The text is split with the module-level ``tokenizer``; surrounding
    whitespace is stripped from each token and the survivors are re-joined
    with single spaces. When ``is_lower_case`` is true the tokens are compared
    to ``stopword_list`` as they are; otherwise each token is lower-cased for
    the comparison only (its original casing is kept in the output).
    """
    stripped = [piece.strip() for piece in tokenizer.tokenize(text)]
    if is_lower_case:
        kept = [piece for piece in stripped if piece not in stopword_list]
    else:
        kept = [piece for piece in stripped if piece.lower() not in stopword_list]
    return ' '.join(kept)

In [6]:
# Clean every review in two passes: strip non-letter characters first, then
# drop English stop words from what is left.
data['review'] = (
    data['review']
    .apply(remove_special_characters)
    .apply(remove_stopwords)
)

In [7]:
# Plain-text dump of the cleaned frame as a sanity check on the cleaning step.
print(data)

                                                  review sentiment
0      One reviewers mentioned watching Oz episode yo...  positive
1      wonderful little production br br filming tech...  positive
2      thought wonderful way spend time hot summer we...  positive
3      Basically theres family little boy Jake thinks...  negative
4      Petter Matteis Love Time Money visually stunni...  positive
...                                                  ...       ...
49995  thought movie right good job wasnt creative or...  positive
49996  Bad plot bad dialogue bad acting idiotic direc...  negative
49997  Catholic taught parochial elementary schools n...  negative
49998  Im going disagree previous comment side Maltin...  negative
49999  one expects Star Trek movies high art fans exp...  negative

[50000 rows x 2 columns]


In [8]:
# Pull the cleaned review strings and their sentiment labels out of the frame
# as NumPy arrays for the scikit-learn steps that follow.
texts = data['review'].values
labels = data['sentiment'].values

In [9]:
# Binary target: 1 where the label is exactly 'positive', 0 for anything else.
y = (labels == 'positive').astype(int)

In [10]:
# TF-IDF features restricted to max_features=1100 vocabulary terms, converted
# from a sparse matrix to a dense array.
# NOTE(review): the vectorizer is fitted on every review before the train/test
# split below, so vocabulary and IDF statistics from the test reviews leak into
# the training features — consider fitting on the training texts only.
vectorizer = TfidfVectorizer(max_features=1100)
X = vectorizer.fit_transform(texts).toarray()

In [11]:
# Half of the samples go to training (train_size=0.5); random_state pins the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=42)

In [12]:
from sklearn.decomposition import IncrementalPCA
# NOTE(review): min(n_samples, n_features) is the largest component count the
# data allows, so this projection presumably keeps every direction and performs
# no dimensionality reduction — confirm this is intended, especially since
# num_components is later reused as the qubit count.
num_components = min(X_train.shape[0], X_train.shape[1])  # Set num_components to the minimum of samples or features
pca = IncrementalPCA(n_components=num_components)  # Use IncrementalPCA for large datasets
# Fit on the training split only, then project the test split with the same
# fitted components. Both names are overwritten in place, so re-running this
# cell alone would apply PCA to already-projected data.
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [13]:
# Rescale every feature with a MaxAbsScaler fitted on the training split; the
# test split reuses the training scale factors rather than being fitted itself.
scaler = MaxAbsScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# One wire per PCA component: qkernel() encodes feature i on wire i.
# NOTE(review): num_components is min(n_samples, n_features) of the training
# matrix; if that is in the hundreds or more, a state-vector simulation on this
# many wires is very likely infeasible — confirm the intended qubit count.
num_qubits = num_components  # Number of qubits should match the number of PCA components
dev = qml.device('default.qubit', wires=num_qubits)

In [15]:
def qkernel(X, Y):
    """Compute a quantum fidelity kernel matrix between the rows of X and Y.

    Every sample is encoded with one RX rotation per wire followed by a chain
    of CNOTs. The kernel value for a pair (x, y) is the probability of reading
    the all-zero state after applying the encoding of x and then the adjoint
    of the encoding of y, i.e. the squared overlap of the two encoded states.
    It is 1 when x == y and symmetric in its arguments.

    The previous implementation concatenated x and y but the circuit only read
    the first ``num_qubits`` entries, so the result never depended on Y, and
    it returned the expectation value of Z on wire 0 rather than an overlap.

    Parameters
    ----------
    X, Y : 2-D arrays whose rows are samples; each row needs at least
        ``num_qubits`` entries (only the first ``num_qubits`` are used).

    Returns
    -------
    numpy.ndarray of shape (X.shape[0], Y.shape[0]).

    Notes
    -----
    One circuit is executed per (row of X, row of Y) pair and each execution
    returns a probability vector with 2**num_qubits entries, so this is only
    practical for a small number of qubits and samples.
    """
    wires = range(num_qubits)

    def feature_map(x):
        # Angle-encode one feature per wire, then entangle neighbouring wires.
        for i in wires:
            qml.RX(x[i], wires=i)
        for i in range(num_qubits - 1):
            qml.CNOT(wires=[i, i + 1])

    @qml.qnode(dev)
    def overlap_circuit(x, y):
        feature_map(x)
        # Undo the encoding of y; the circuit returns to |0...0> exactly when
        # the two encoded states coincide.
        qml.adjoint(feature_map)(y)
        return qml.probs(wires=wires)

    kernel_matrix = np.zeros((X.shape[0], Y.shape[0]))
    for i in range(X.shape[0]):
        for j in range(Y.shape[0]):
            # Entry 0 of the probability vector is P(|0...0>), the fidelity.
            kernel_matrix[i, j] = overlap_circuit(X[i], Y[j])[0]
    return kernel_matrix

In [16]:
# The kernel choice lives in its own variable. Assigning the string to
# `qkernel` would replace the qkernel() function defined in the previous cell,
# leaving it unusable until that cell is re-run.
# NOTE(review): with kernel='linear' this is a classical linear SVM; the
# quantum kernel is not used by the model despite the `qsvm` name.
svm_kernel = 'linear'  # Consider using a simpler kernel if needed
qsvm = SVC(kernel=svm_kernel, verbose=True)  # Enable verbosity to monitor training progress
qsvm.fit(X_train, y_train)

[LibSVM]

In [17]:
# Predict labels for the held-out half using the fitted classifier.
y_pred = qsvm.predict(X_test)

In [19]:
# Overall accuracy of the predictions on the test split.
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1. target_names are given in label order
# (0, 1), which matches the 0 = negative / 1 = positive encoding of y.
report = classification_report(y_test, y_pred, target_names=['Negative', 'Positive'])
print(report)

Accuracy: 85.80%
              precision    recall  f1-score   support

    Negative       0.87      0.85      0.86     12483
    Positive       0.85      0.87      0.86     12517

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000

