In [22]:
import pandas as pd
import numpy as np
import os

In [23]:
import spacy 
# Load the small English pipeline once, with the dependency parser and NER
# components disabled. (A preceding full load of the same model was discarded
# immediately by this assignment, so it only cost time and memory.)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [24]:
# Read the tagged-word corpus from a CSV in the working directory and preview
# the first rows (the preview shows a `word` and a `tag` column).
df = pd.read_csv('tagged-words.csv')
df.head()

Unnamed: 0,word,tag
0,the,DET
1,fulton,NOUN
2,county,NOUN
3,grand,ADJ
4,jury,NOUN


In [25]:
# (rows, columns) of the loaded frame.
df.shape

(1161192, 2)

In [26]:
# Number of distinct rows in df, i.e. distinct (word, tag) combinations.
# NOTE: this runs before the case normalisation in a later cell, so spellings
# that differ only by case are counted separately here.
len(df.value_counts())

53532

In [27]:
# Normalise casing: words to lower case, tags to upper case.
# simple_pos_tagger lower-cases each token before the dictionary lookup, so
# the dictionary keys built from this column need to be lower-case as well.
# NOTE: both columns are overwritten in place on df.
df['word'] = df['word'].str.lower()
df['tag'] = df['tag'].str.upper()

In [28]:
# Count every distinct row of df and flatten the result into a three-column
# frame labelled word / tag / count.
f2 = (
    df.value_counts()
      .reset_index()
      .set_axis(['word', 'tag', 'count'], axis=1)
)
f2.head()

Unnamed: 0,word,tag,count
0,!,ADJ,147565
1,the,DET,69968
2,of,ADP,36410
3,and,CONJ,28850
4,a,DET,23070


In [29]:
# Occurrence count for every (word, tag) pair.
freq_table = (
    df.groupby(['word', 'tag'])
      .size()
      .reset_index(name='count')
)

# Per word, keep the row whose count is the largest.
most_common_tag = freq_table.loc[
    freq_table.groupby('word')['count'].idxmax()
]

# Lookup table {word: most frequent tag}.
word_tag_dict = {
    word: tag
    for word, tag in zip(most_common_tag['word'], most_common_tag['tag'])
}

print(list(word_tag_dict.items())[:10])


[('!', 'ADJ'), ('$.027', 'NOUN'), ('$.03', 'NOUN'), ('$.054/mbf', 'NOUN'), ('$.07', 'NOUN'), ('$.07/cwt', 'NOUN'), ('$.076', 'NOUN'), ('$.09', 'NOUN'), ('$.10-a-minute', 'NOUN'), ('$.105', 'NOUN')]


In [30]:
def simple_pos_tagger(tokens, word_tag_dict, default_tag='NOUN'):
    """Tag each token by dictionary lookup on its lower-cased form.

    Args:
        tokens: Iterable of string tokens.
        word_tag_dict: Mapping from word to tag; it is queried with the
            lower-cased token.
        default_tag: Tag given to tokens that are not in ``word_tag_dict``.

    Returns:
        A list of ``(token, tag)`` tuples in input order. Tokens keep their
        original casing in the output.
    """
    return [
        (token, word_tag_dict.get(token.lower(), default_tag))
        for token in tokens
    ]


In [31]:
# Try the lookup tagger on a sample sentence. Tokenisation is a plain
# whitespace split, so any punctuation would stay attached to its word.
sentence = "I saw him running away"
tokens = sentence.split()

print(simple_pos_tagger(tokens, word_tag_dict))


[('I', 'PRON'), ('saw', 'VERB'), ('him', 'PRON'), ('running', 'VERB'), ('away', 'ADV')]


In [32]:
# Second sample sentence; rebinds `sentence` and `tokens` from the previous cell.
sentence = "He wished he was rich"
tokens = sentence.split()
print(simple_pos_tagger(tokens, word_tag_dict))

[('He', 'PRON'), ('wished', 'VERB'), ('he', 'PRON'), ('was', 'VERB'), ('rich', 'ADJ')]


In [34]:
# Raw emission counts: one row per word, one column per tag.
emission_counts = pd.crosstab(index=df['word'], columns=df['tag'])

# Emission probabilities P(word | tag): divide each tag column by its total so
# every column sums to 1. This reuses the counts above instead of running a
# second crosstab over the whole frame; it is the same arithmetic that
# crosstab(normalize='columns') performs.
emission_matrix = emission_counts / emission_counts.sum(axis=0)

# Optional: view first few rows
print(emission_matrix.head())


tag             ADJ  ADP  ADV  CONJ  DET      NOUN  NUM  PRON  PRT  VERB    X
word                                                                         
!          0.638047  0.0  0.0   0.0  0.0  0.000000  0.0   0.0  0.0   0.0  0.0
$.027      0.000000  0.0  0.0   0.0  0.0  0.000007  0.0   0.0  0.0   0.0  0.0
$.03       0.000000  0.0  0.0   0.0  0.0  0.000015  0.0   0.0  0.0   0.0  0.0
$.054/mbf  0.000000  0.0  0.0   0.0  0.0  0.000004  0.0   0.0  0.0   0.0  0.0
$.07       0.000000  0.0  0.0   0.0  0.0  0.000011  0.0   0.0  0.0   0.0  0.0


In [35]:
# Emission probability of the word 'his' under the PRON tag, to 3 decimals.
p_his_given_pron = round(emission_matrix.loc['his', 'PRON'], 3)
print(p_his_given_pron)


0.001


In [36]:

# Pair every row's tag with the tag of the row that follows it; the last row
# has no successor and gets NaN.
# NOTE(review): this adds a column to df and the next line rebinds df, so
# re-running earlier cells that call df.value_counts() afterwards would see a
# different frame.
df['next_tag'] = df['tag'].shift(-1)
# Drop rows whose own tag is '.'. Order matters: next_tag was computed before
# this filter, so '.' can still appear as a destination tag, and the remaining
# rows keep the successor they had in the unfiltered sequence.
# NOTE(review): the emission matrix printed earlier shows no '.' column, so
# this filter may be a no-op on this data — confirm.
df = df[df['tag'] != '.']

# Transition probabilities P(next_tag | tag): normalize='index' scales each
# row (current tag) to sum to 1.
transition_matrix = pd.crosstab(index=df['tag'],
                                columns=df['next_tag'],
                                normalize='index')

print(round(transition_matrix, 3))


next_tag    ADJ    ADP    ADV   CONJ    DET   NOUN    NUM   PRON    PRT  \
tag                                                                       
ADJ       0.173  0.103  0.054  0.070  0.097  0.323  0.014  0.070  0.028   
ADP       0.092  0.020  0.016  0.002  0.456  0.258  0.030  0.070  0.014   
ADV       0.307  0.142  0.097  0.017  0.074  0.033  0.013  0.048  0.029   
CONJ      0.133  0.073  0.091  0.000  0.151  0.244  0.019  0.068  0.025   
DET       0.253  0.009  0.018  0.001  0.006  0.627  0.010  0.010  0.002   
NOUN      0.297  0.245  0.027  0.060  0.016  0.150  0.008  0.020  0.018   
NUM       0.333  0.132  0.020  0.038  0.014  0.381  0.022  0.009  0.005   
PRON      0.113  0.056  0.054  0.011  0.018  0.009  0.001  0.008  0.024   
PRT       0.096  0.091  0.036  0.012  0.084  0.036  0.005  0.007  0.011   
VERB      0.138  0.169  0.103  0.014  0.163  0.098  0.009  0.055  0.066   
X         0.277  0.053  0.007  0.023  0.006  0.056  0.001  0.006  0.007   

next_tag   VERB      X  