In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Toy corpus: one sentence per class.
texts = [
    "basketball is a team sport where teams shoot a basketball",
    "football is a sport where teams score goals",
]

# Class labels aligned with `texts`: 1 = basketball, 0 = football.
labels = [1, 0]

# Learn the unigram vocabulary from the corpus; the fitted vectorizer is the
# cell's last expression, so its repr is displayed.
vectorizer = CountVectorizer(ngram_range=(1, 1))
vectorizer.fit(texts)


CountVectorizer()

In [3]:
# Encode every text as a row of per-term counts using the fitted vocabulary,
# then display the result as a dense matrix (one column per vocabulary term).
ngrams = vectorizer.transform(texts)
ngrams.todense()

matrix([[2, 0, 0, 1, 0, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 0, 1, 0, 1, 1]], dtype=int64)

In [4]:
# Mapping learned by the vectorizer: term -> column index in the count matrix.
vectorizer.vocabulary_


{'basketball': 0,
 'is': 3,
 'team': 7,
 'sport': 6,
 'where': 9,
 'teams': 8,
 'shoot': 5,
 'football': 1,
 'score': 4,
 'goals': 2}

In [5]:
# Build a DataFrame of unigram counts: one row per text, one column per term.
# Sort the (term, column index) pairs by column index so that the column
# labels line up with the columns of the count matrix.
keys_values_sorted = sorted(vectorizer.vocabulary_.items(), key=lambda pair: pair[1])
keys_sorted = tuple(term for term, _index in keys_values_sorted)

ngrams_matrix = ngrams.todense()
df = pd.DataFrame(ngrams_matrix, columns=keys_sorted)
df

Unnamed: 0,basketball,football,goals,is,score,shoot,sport,team,teams,where
0,2,0,0,1,0,1,1,1,1,1
1,0,1,1,1,1,0,1,0,1,1


In [7]:
# Train a logistic regression classifier on the unigram counts.
model = LogisticRegression()
model.fit(ngrams, labels)

# `keys_sorted` comes from an earlier cell. zip() silently stops at the shorter
# input, so a stale `keys_sorted` (e.g. from a different vectorizer) would pair
# terms with the wrong weights without raising. Fail loudly instead.
assert len(keys_sorted) == model.coef_.shape[1], (
    "keys_sorted does not match the features the model was trained on; "
    "re-run the cell that builds keys_sorted"
)

# Show logistic regression weights: row 0 of `coef_` holds one weight per
# feature column, in the same order as `keys_sorted`.
from_unigram_to_weight = dict(zip(keys_sorted, model.coef_[0]))
from_unigram_to_weight

{'basketball': 0.49464700259889355,
 'football': -0.24731844154410243,
 'goals': -0.24731844154410243,
 'is': 5.059755344306952e-06,
 'score': -0.24731844154410243,
 'shoot': 0.24732350129944677,
 'sport': 5.059755344306952e-06,
 'team': 0.24732350129944677,
 'teams': 5.059755344306952e-06,
 'where': 5.059755344306952e-06}

# Using Unigrams and Bigrams

In [9]:
# ngram_range=(1, 2) extracts both unigrams and bigrams.
vectorizer = CountVectorizer(ngram_range=(1, 2))

# Learn the vocabulary and build the count matrix in a single pass over the
# corpus, instead of calling fit() and then transform() on the same texts.
ngrams = vectorizer.fit_transform(texts)
ngrams.todense()

matrix([[2, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1],
        [0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1]],
       dtype=int64)

In [10]:
# Mapping learned by the refitted vectorizer: n-gram (unigram or bigram) -> column index.
vectorizer.vocabulary_

{'basketball': 0,
 'is': 5,
 'team': 14,
 'sport': 12,
 'where': 19,
 'teams': 16,
 'shoot': 10,
 'basketball is': 1,
 'is team': 7,
 'team sport': 15,
 'sport where': 13,
 'where teams': 20,
 'teams shoot': 18,
 'shoot basketball': 11,
 'football': 2,
 'score': 8,
 'goals': 4,
 'football is': 3,
 'is sport': 6,
 'teams score': 17,
 'score goals': 9}

In [11]:
# Fit a fresh classifier on the current n-gram counts. Creating the estimator
# here means this cell does not depend on a `model` left over from an earlier
# cell, and does not refit the object whose weights were already displayed.
model = LogisticRegression()
model.fit(ngrams, labels)

# Feature names ordered by column index, so they line up with the columns of
# the count matrix and therefore with the entries of `model.coef_[0]`.
keys_values_sorted = sorted(vectorizer.vocabulary_.items(), key=lambda pair: pair[1])
keys_sorted = tuple(term for term, _index in keys_values_sorted)

# zip() silently stops at the shorter input; make sure the vocabulary matches
# the features the model was trained on before pairing names with weights.
assert len(keys_sorted) == model.coef_.shape[1], (
    "vectorizer vocabulary does not match the features the model was trained on; "
    "re-run the cell that builds ngrams"
)

# Show logistic regression weights, one per n-gram.
from_ngram_to_weight = dict(zip(keys_sorted, model.coef_[0]))
from_ngram_to_weight

{'basketball': 0.34694141625374525,
 'basketball is': 0.17347070812687262,
 'football': -0.173468461920609,
 'football is': -0.173468461920609,
 'goals': -0.173468461920609,
 'is': 2.2462062636217884e-06,
 'is sport': -0.173468461920609,
 'is team': 0.17347070812687262,
 'score': -0.173468461920609,
 'score goals': -0.173468461920609,
 'shoot': 0.17347070812687262,
 'shoot basketball': 0.17347070812687262,
 'sport': 2.2462062636217884e-06,
 'sport where': 2.2462062636217884e-06,
 'team': 0.17347070812687262,
 'team sport': 0.17347070812687262,
 'teams': 2.2462062636217884e-06,
 'teams score': -0.173468461920609,
 'teams shoot': 0.17347070812687262,
 'where': 2.2462062636217884e-06,
 'where teams': 2.2462062636217884e-06}