# Load data

In [1]:
import pandas as pd
import numpy as np

## Load positive and negative words

In [2]:
neg_words = pd.DataFrame(
    open("../data/negative-words.txt", encoding="utf-8").read().splitlines(),
    columns=["word"]
)

pos_words = pd.DataFrame(
    open("../data/negative-words.txt", encoding="utf-8").read().splitlines(),
    columns=["word"]
)

neg_words["label"] = 0
pos_words["label"] = 1


word_df = pd.concat([pos_words, neg_words], ignore_index=True)

#shuffle rows
word_df = word_df.sample(frac=1).reset_index(drop=True)

word_df.head()

Unnamed: 0,word,label
0,ill-tempered,0
1,corrosions,0
2,bravado,0
3,unravel,1
4,turmoil,0


## Load positive and negative reviews

In [3]:
neg_reviews = pd.DataFrame(
    open("../data/negative-reviews.txt", encoding="utf-8").read().splitlines(),
    columns=["review"]
)

pos_reviews = pd.DataFrame(
    open("../data/positive-reviews.txt", encoding="utf-8").read().splitlines(),
    columns=["review"]
)

neg_reviews["label"] = 0
pos_reviews["label"] = 1


review_df = pd.concat([neg_reviews, pos_reviews], ignore_index=True)

#shuffle rows
review_df = review_df.sample(frac=1).reset_index(drop=True)

# Feature Selection

## Count Positive and Negative words

In [4]:
pos_set = set(pos_words["word"])
neg_set = set(neg_words["word"])


review_df["pos_counts"] = review_df["review"].apply(
    lambda text: sum(word.lower() in pos_set for word in text.split())
)

review_df["neg_counts"] = review_df["review"].apply(
    lambda text: sum(word.lower() in neg_set for word in text.split())
)

### Positive counts > 0

In [5]:
review_df[review_df["pos_counts"] > 0]

Unnamed: 0,review,label,pos_counts,neg_counts
1,"TINY ink wells, expensive refills, noisy print...",0,2,2
2,expensive ink,0,1,1
3,"No low-ink indicator, high cartridge costs, me...",0,1,1
4,Good value; fast; zoom; uses AA batteries inst...,1,1,1
13,Small size could be a problem for large hands/...,0,1,1
...,...,...,...,...
39983,"annoying Mac software, networkability",0,1,1
39985,"Not cheap, sound could be better, magnet-based...",0,2,2
39995,Battery life is horrible!! Small buttons diff...,0,1,1
39998,"Complicated USB installation , not as fast as ...",0,2,2


### Negative counts > 0

In [6]:
review_df[review_df["neg_counts"] > 0]

Unnamed: 0,review,label,pos_counts,neg_counts
1,"TINY ink wells, expensive refills, noisy print...",0,2,2
2,expensive ink,0,1,1
3,"No low-ink indicator, high cartridge costs, me...",0,1,1
4,Good value; fast; zoom; uses AA batteries inst...,1,1,1
13,Small size could be a problem for large hands/...,0,1,1
...,...,...,...,...
39983,"annoying Mac software, networkability",0,1,1
39985,"Not cheap, sound could be better, magnet-based...",0,2,2
39995,Battery life is horrible!! Small buttons diff...,0,1,1
39998,"Complicated USB installation , not as fast as ...",0,2,2


## Count number of "no"

In [7]:
review_df["no_counts"] = review_df["review"].apply(
    lambda text: sum(word.lower() == 'no' for word in text.split())
)

review_df[review_df["no_counts"] > 0]

Unnamed: 0,review,label,pos_counts,neg_counts,no_counts
0,no usb out of the box,0,0,0,1
3,"No low-ink indicator, high cartridge costs, me...",0,1,1,1
7,No re-chargable batteries or charger supplied ...,0,0,0,1
18,"Tiny screen, no WAP/web, limited battery life",0,1,1,1
21,"NO SPEAKER PHONE, NOT DUAL MODE.",0,0,0,1
...,...,...,...,...,...
39841,"Somewhat large / heavy, no protective lens fil...",0,0,0,1
39883,Limited capacity and no LCD for preview,0,1,1,1
39918,"Non-standard earbud, few accessories, no repai...",0,0,0,1
39945,"No case, very poor photos in dim light, long s...",0,3,3,1


## Contain "!"

In [8]:
review_df["has_exclamation"] = review_df["review"].apply(
    lambda text: 1 if '!' in text else 0
)

review_df[review_df["has_exclamation"] > 0]

Unnamed: 0,review,label,pos_counts,neg_counts,no_counts,has_exclamation
35,None yet!,0,0,0,0,1
63,"EVERYTHING! dark pictures, blurry images, cheap",0,3,3,0,1
78,Haven't found any yet!,0,0,0,0,1
92,"Print Quality, Print Quality, Print Quality!",1,0,0,0,1
95,"Durable; withstands ""wear and tear."" Lets you ...",1,0,0,0,1
...,...,...,...,...,...,...
39956,Great deal for the money!,1,0,0,0,1
39966,It's an EPSON people! Excellent paper handling,1,0,0,0,1
39989,"One color, flip feels flimsy, buggie!",0,0,0,0,1
39992,NONE AT ALL!!!,0,0,0,0,1


## Count Pronouns

In [9]:
def count_pronoun(text):
    pronouns = ['i', 'me', 'my', 'you', 'your', 'we', 'our', 'us']
    total = 0
    words = text.split()
    for w in words:
        for p in pronouns:
            if p == w.lower():
                total += 1
    return total
    
review_df["pronoun_counts"] = review_df["review"].apply(
    count_pronoun
)

review_df[review_df["pronoun_counts"] > 0]

Unnamed: 0,review,label,pos_counts,neg_counts,no_counts,has_exclamation,pronoun_counts
39,Buy the Camera Dock when you purchase the Camera,0,0,0,0,0,1
40,Control every aspect of your photograph even t...,1,0,0,0,0,1
43,"Fantastic value, great images, solid camera, G...",1,0,0,0,0,1
44,"unintuitive GUI, 1 day battery life, crashes, ...",0,1,1,0,0,1
49,No Sony Memorystick drive. First one I bought ...,0,0,0,1,0,1
...,...,...,...,...,...,...,...
39928,everything you need to get started is in the box,1,0,0,0,0,1
39935,For the Photo Enthusiast you will want more ma...,0,0,0,0,0,1
39952,too big for me,0,0,0,0,0,1
39974,Haven't found anything I don't like,0,0,0,0,0,1


## Get log(len(review))

In [10]:
review_df["log2_review"] = review_df["review"].apply(
    lambda text: np.log2(len(text) or 1)
)

review_df[review_df["log2_review"] > 0]

Unnamed: 0,review,label,pos_counts,neg_counts,no_counts,has_exclamation,pronoun_counts,log2_review
0,no usb out of the box,0,0,0,1,0,0,4.392317
1,"TINY ink wells, expensive refills, noisy print...",0,2,2,0,0,0,5.754888
2,expensive ink,0,1,1,0,0,0,3.700440
3,"No low-ink indicator, high cartridge costs, me...",0,1,1,1,0,0,5.882643
4,Good value; fast; zoom; uses AA batteries inst...,1,1,1,0,0,0,6.672425
...,...,...,...,...,...,...,...,...
39995,Battery life is horrible!! Small buttons diff...,0,1,1,0,1,0,6.000000
39996,"Sleek, great form factor, has a keyboard",1,0,0,0,0,0,5.321928
39997,"Crisp pictures, Stylish, Easy Connection, User...",1,0,0,0,0,0,5.807355
39998,"Complicated USB installation , not as fast as ...",0,2,2,0,0,0,6.554589


# Train the model

## Use LinearSVC from sklearn to build our SVM classification model

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer

features = ['review', 'pos_counts', 'neg_counts', 'no_counts', 'has_exclamation', 'pronoun_counts', 'log2_review']
text_features = 'review'
numeric_features = [
    'pos_counts',
    'neg_counts',
    'no_counts',
    'has_exclamation',
    'pronoun_counts',
    'log2_review'
]

X = review_df[features]
y = review_df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train:\n", X_train)
print("y_train:\n", y_train)
print("X_test:\n", X_test)
print("y_test:\n", y_test)

preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), text_features),
        ('num', 'passthrough', numeric_features)
    ]
)

model = Pipeline([
    ('preprocess', preprocessor),
    ('nb', MultinomialNB())
])

model.fit(X_train, y_train)


X_train:
                                                   review  pos_counts  \
14307                                       None so far!           0   
17812  Known issues with screen assembly in early bui...           1   
11020                       easy to use, saves to floppy           0   
15158                                               none           0   
24990             Print Quality, Ease of setup, Reliable           0   
...                                                  ...         ...   
6265   Not a  amp;quot;solid amp;quot; printer, NO WI...           0   
11284                             Small battery, red-eye           0   
38158                        size, weight, photo options           0   
860                                   small, cute, price           0   
15795                  Palm Pilot and phone in one unit,           0   

       neg_counts  no_counts  has_exclamation  pronoun_counts  log2_review  
14307           0          0                1   

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('preprocess', ...), ('nb', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"transformers  transformers: list of tuples List of (name, transformer, columns) tuples specifying the transformer objects to be applied to subsets of the data. name : str  Like in Pipeline and FeatureUnion, this allows the transformer and  its parameters to be set using ``set_params`` and searched in grid  search. transformer : {'drop', 'passthrough'} or estimator  Estimator must support :term:`fit` and :term:`transform`.  Special-cased strings 'drop' and 'passthrough' are accepted as  well, to indicate to drop the columns or to pass them through  untransformed, respectively. columns : str, array-like of str, int, array-like of int, array-like of bool, slice or callable  Indexes the data on its second axis. Integers are interpreted as  positional columns, while strings can reference DataFrame columns  by name. A scalar string or int should be used where  ``transformer`` expects X to be a 1d array-like (vector),  otherwise a 2d array will be passed to the transformer.  A callable is passed the input data `X` and can return any of the  above. To select multiple columns by name or dtype, you can use  :obj:`make_column_selector`.","[('text', ...), ('num', ...)]"
,"remainder  remainder: {'drop', 'passthrough'} or estimator, default='drop' By default, only the specified columns in `transformers` are transformed and combined in the output, and the non-specified columns are dropped. (default of ``'drop'``). By specifying ``remainder='passthrough'``, all remaining columns that were not specified in `transformers`, but present in the data passed to `fit` will be automatically passed through. This subset of columns is concatenated with the output of the transformers. For dataframes, extra columns not seen during `fit` will be excluded from the output of `transform`. By setting ``remainder`` to be an estimator, the remaining non-specified columns will use the ``remainder`` estimator. The estimator must support :term:`fit` and :term:`transform`. Note that using this feature requires that the DataFrame columns input at :term:`fit` and :term:`transform` have identical order.",'drop'
,"sparse_threshold  sparse_threshold: float, default=0.3 If the output of the different transformers contains sparse matrices, these will be stacked as a sparse matrix if the overall density is lower than this value. Use ``sparse_threshold=0`` to always return dense. When the transformed output consists of all dense data, the stacked result will be dense, and this keyword will be ignored.",0.3
,"n_jobs  n_jobs: int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"transformer_weights  transformer_weights: dict, default=None Multiplicative weights for features per transformer. The output of the transformer is multiplied by these weights. Keys are transformer names, values the weights.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed.",False
,"verbose_feature_names_out  verbose_feature_names_out: bool, str or Callable[[str, str], str], default=True - If True, :meth:`ColumnTransformer.get_feature_names_out` will prefix  all feature names with the name of the transformer that generated that  feature. It is equivalent to setting  `verbose_feature_names_out=""{transformer_name}__{feature_name}""`. - If False, :meth:`ColumnTransformer.get_feature_names_out` will not  prefix any feature names and will error if feature names are not  unique. - If ``Callable[[str, str], str]``,  :meth:`ColumnTransformer.get_feature_names_out` will rename all the features  using the name of the transformer. The first argument of the callable is the  transformer name and the second argument is the feature name. The returned  string will be the new feature name. - If ``str``, it must be a string ready for formatting. The given string will  be formatted using two field names: ``transformer_name`` and ``feature_name``.  e.g. ``""{feature_name}__{transformer_name}""``. See :meth:`str.format` method  from the standard library for more info. .. versionadded:: 1.0 .. versionchanged:: 1.6  `verbose_feature_names_out` can be a callable or a string to be formatted.",True
,"force_int_remainder_cols  force_int_remainder_cols: bool, default=False This parameter has no effect. .. note::  If you do not access the list of columns for the remainder columns  in the `transformers_` fitted attribute, you do not need to set  this parameter. .. versionadded:: 1.5 .. versionchanged:: 1.7  The default value for `force_int_remainder_cols` will change from  `True` to `False` in version 1.7. .. deprecated:: 1.7  `force_int_remainder_cols` is deprecated and will be removed in 1.9.",'deprecated'

0,1,2
,"input  input: {'filename', 'file', 'content'}, default='content' - If `'filename'`, the sequence passed as an argument to fit is  expected to be a list of filenames that need reading to fetch  the raw content to analyze. - If `'file'`, the sequence items must have a 'read' method (file-like  object) that is called to fetch the bytes in memory. - If `'content'`, the input is expected to be a sequence of items that  can be of type string or byte.",'content'
,"encoding  encoding: str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode.",'utf-8'
,"decode_error  decode_error: {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'.",'strict'
,"strip_accents  strip_accents: {'ascii', 'unicode'} or callable, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have a direct ASCII mapping. 'unicode' is a slightly slower method that works on any characters. None (default) means no character normalization is performed. Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.",
,"lowercase  lowercase: bool, default=True Convert all characters to lowercase before tokenizing.",True
,"preprocessor  preprocessor: callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer`` is not callable.",
,"tokenizer  tokenizer: callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``.",
,"analyzer  analyzer: {'word', 'char', 'char_wb'} or callable, default='word' Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. .. versionchanged:: 0.21  Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data  is first read from the file and then passed to the given callable  analyzer.",'word'
,"stop_words  stop_words: {'english'}, list, default=None If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 'english' is currently the only supported string value. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. If None, no stop words will be used. In this case, setting `max_df` to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms.",
,"token_pattern  token_pattern: str, default=r""(?u)\\b\\w\\w+\\b"" Regular expression denoting what constitutes a ""token"", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). If there is a capturing group in token_pattern then the captured group content, not the entire match, becomes the token. At most one capturing group is permitted.",'(?u)\\b\\w\\w+\\b'

0,1,2
,"alpha  alpha: float or array-like of shape (n_features,), default=1.0 Additive (Laplace/Lidstone) smoothing parameter (set alpha=0 and force_alpha=True, for no smoothing).",1.0
,"force_alpha  force_alpha: bool, default=True If False and alpha is less than 1e-10, it will set alpha to 1e-10. If True, alpha will remain unchanged. This may cause numerical errors if alpha is too close to 0. .. versionadded:: 1.2 .. versionchanged:: 1.4  The default value of `force_alpha` changed to `True`.",True
,"fit_prior  fit_prior: bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used.",True
,"class_prior  class_prior: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",


# Result

In [12]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Results")

result_df = pd.DataFrame({
    "X_test": X_test['review'],
    "y_test": y_test,
    "y_pred": y_pred
})

result_df.head()


Accuracy: 0.876625
              precision    recall  f1-score   support

           0       0.91      0.84      0.87      4003
           1       0.85      0.91      0.88      3997

    accuracy                           0.88      8000
   macro avg       0.88      0.88      0.88      8000
weighted avg       0.88      0.88      0.88      8000

Results


Unnamed: 0,X_test,y_test,y_pred
32823,"No AF illumination beam, no manual exposure op...",0,0
16298,"Low battery life, not very durable, lack of ac...",0,0
28505,Trouble free connection to PC with Infra-Red p...,1,0
6689,Excellent print quality. Price. Handy bundled ...,1,1
26893,Cheap,1,0


## Select only the review with no features

In [13]:
X = review_df['review']
y = review_df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train:\n", X_train)
print("y_train:\n", y_train)
print("X_test:\n", X_test)
print("y_test:\n", y_test)

model = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("svm", LinearSVC())
])

model.fit(X_train, y_train)

X_train:
 14307                                         None so far!
17812    Known issues with screen assembly in early bui...
11020                         easy to use, saves to floppy
15158                                                 none
24990               Print Quality, Ease of setup, Reliable
                               ...                        
6265     Not a  amp;quot;solid amp;quot; printer, NO WI...
11284                               Small battery, red-eye
38158                          size, weight, photo options
860                                     small, cute, price
15795                    Palm Pilot and phone in one unit,
Name: review, Length: 32000, dtype: object
y_train:
 14307    0
17812    0
11020    1
15158    0
24990    1
        ..
6265     0
11284    0
38158    1
860      1
15795    1
Name: label, Length: 32000, dtype: int64
X_test:
 32823    No AF illumination beam, no manual exposure op...
16298    Low battery life, not very durable, lack of ac...

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('tfidf', ...), ('svm', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"input  input: {'filename', 'file', 'content'}, default='content' - If `'filename'`, the sequence passed as an argument to fit is  expected to be a list of filenames that need reading to fetch  the raw content to analyze. - If `'file'`, the sequence items must have a 'read' method (file-like  object) that is called to fetch the bytes in memory. - If `'content'`, the input is expected to be a sequence of items that  can be of type string or byte.",'content'
,"encoding  encoding: str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode.",'utf-8'
,"decode_error  decode_error: {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'.",'strict'
,"strip_accents  strip_accents: {'ascii', 'unicode'} or callable, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have a direct ASCII mapping. 'unicode' is a slightly slower method that works on any characters. None (default) means no character normalization is performed. Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.",
,"lowercase  lowercase: bool, default=True Convert all characters to lowercase before tokenizing.",True
,"preprocessor  preprocessor: callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer`` is not callable.",
,"tokenizer  tokenizer: callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``.",
,"analyzer  analyzer: {'word', 'char', 'char_wb'} or callable, default='word' Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. .. versionchanged:: 0.21  Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data  is first read from the file and then passed to the given callable  analyzer.",'word'
,"stop_words  stop_words: {'english'}, list, default=None If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 'english' is currently the only supported string value. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. If None, no stop words will be used. In this case, setting `max_df` to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms.",
,"token_pattern  token_pattern: str, default=r""(?u)\\b\\w\\w+\\b"" Regular expression denoting what constitutes a ""token"", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). If there is a capturing group in token_pattern then the captured group content, not the entire match, becomes the token. At most one capturing group is permitted.",'(?u)\\b\\w\\w+\\b'

0,1,2
,"penalty  penalty: {'l1', 'l2'}, default='l2' Specifies the norm used in the penalization. The 'l2' penalty is the standard used in SVC. The 'l1' leads to ``coef_`` vectors that are sparse.",'l2'
,"loss  loss: {'hinge', 'squared_hinge'}, default='squared_hinge' Specifies the loss function. 'hinge' is the standard SVM loss (used e.g. by the SVC class) while 'squared_hinge' is the square of the hinge loss. The combination of ``penalty='l1'`` and ``loss='hinge'`` is not supported.",'squared_hinge'
,"dual  dual: ""auto"" or bool, default=""auto"" Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features. `dual=""auto""` will choose the value of the parameter automatically, based on the values of `n_samples`, `n_features`, `loss`, `multi_class` and `penalty`. If `n_samples` < `n_features` and optimizer supports chosen `loss`, `multi_class` and `penalty`, then dual will be set to True, otherwise it will be set to False. .. versionchanged:: 1.3  The `""auto""` option is added in version 1.3 and will be the default  in version 1.5.",'auto'
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"multi_class  multi_class: {'ovr', 'crammer_singer'}, default='ovr' Determines the multi-class strategy if `y` contains more than two classes. ``""ovr""`` trains n_classes one-vs-rest classifiers, while ``""crammer_singer""`` optimizes a joint objective over all classes. While `crammer_singer` is interesting from a theoretical perspective as it is consistent, it is seldom used in practice as it rarely leads to better accuracy and is more expensive to compute. If ``""crammer_singer""`` is chosen, the options loss, penalty and dual will be ignored.",'ovr'
,"fit_intercept  fit_intercept: bool, default=True Whether or not to fit an intercept. If set to True, the feature vector is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where 1 corresponds to the intercept. If set to False, no intercept will be used in calculations (i.e. data is expected to be already centered).",True
,"intercept_scaling  intercept_scaling: float, default=1.0 When `fit_intercept` is True, the instance vector x becomes ``[x_1, ..., x_n, intercept_scaling]``, i.e. a ""synthetic"" feature with a constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight. Note that liblinear internally penalizes the intercept, treating it like any other term in the feature vector. To reduce the impact of the regularization on the intercept, the `intercept_scaling` parameter can be set to a value greater than 1; the higher the value of `intercept_scaling`, the lower the impact of regularization on it. Then, the weights become `[w_x_1, ..., w_x_n, w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent the feature weights and the intercept weight is scaled by `intercept_scaling`. This scaling allows the intercept term to have a different regularization behavior compared to the other features.",1
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to ``class_weight[i]*C`` for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",
,"verbose  verbose: int, default=0 Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in liblinear that, if enabled, may not work properly in a multithreaded context.",0


In [14]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Results")

result_df = pd.DataFrame({
    "X_test": X_test,
    "y_test": y_test,
    "y_pred": y_pred
})

result_df.head()

Accuracy: 0.9335
              precision    recall  f1-score   support

           0       0.93      0.94      0.93      4003
           1       0.94      0.93      0.93      3997

    accuracy                           0.93      8000
   macro avg       0.93      0.93      0.93      8000
weighted avg       0.93      0.93      0.93      8000

Results


Unnamed: 0,X_test,y_test,y_pred
32823,"No AF illumination beam, no manual exposure op...",0,0
16298,"Low battery life, not very durable, lack of ac...",0,0
28505,Trouble free connection to PC with Infra-Red p...,1,0
6689,Excellent print quality. Price. Handy bundled ...,1,1
26893,Cheap,1,1
