In [1]:
def find_sentiment(sentence, pos, neg):
    """Classify a sentence by counting the distinct sentiment words it contains.

    The sentence is split on whitespace, duplicate tokens are dropped, and the
    remaining tokens are matched verbatim against the two word collections
    (no lower-casing or punctuation stripping is applied).

    :param sentence: text to classify, a string
    :param pos: collection of positive words
    :param neg: collection of negative words
    :return: "positive", "negative" or "neutral"
    """
    # str.split() with no argument splits on any run of whitespace, e.g.
    # "this is a sentence!" -> ["this", "is", "a", "sentence!"].
    # To split on every single space instead, pass " " explicitly.
    unique_tokens = set(sentence.split())

    # Number of distinct tokens shared with each word collection.
    pos_hits = len(unique_tokens.intersection(pos))
    neg_hits = len(unique_tokens.intersection(neg))

    # The sign of the difference selects the label; a tie is neutral.
    margin = pos_hits - neg_hits
    if margin == 0:
        return "neutral"
    return "positive" if margin > 0 else "negative"

In [3]:
import nltk
# Download the 'punkt' package (tokenizer data, per the download log below)
# so that word_tokenize can be used in the following cells.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead —
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rebor\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [4]:
from nltk.tokenize import word_tokenize

# Sample sentence used to compare str.split() with word_tokenize.
sentence = "hi, how are you?"


# Whitespace splitting keeps punctuation glued to the words ('hi,', 'you?').
sentence.split()

['hi,', 'how', 'are', 'you?']

In [5]:
# word_tokenize emits punctuation marks as separate tokens.
word_tokenize(sentence)

['hi', ',', 'how', 'are', 'you', '?']

In [6]:
from sklearn.feature_extraction.text import CountVectorizer 
# Build a small corpus of sentences
corpus = [ "hello, how are you?", 
          "im getting bored at home. And you? What do you think?",
          "did you know about counts", "let's see if this works!", "YES!!!!" ] 

# Initialize CountVectorizer with its default settings
ctv = CountVectorizer() 
# Fit on the corpus; this builds the vocabulary exposed as ctv.vocabulary_
ctv.fit(corpus) 

# Convert each sentence into a row of token counts (printed as a sparse matrix below)
corpus_transformed = ctv.transform(corpus) 

In [7]:
# Each printed line is (sentence index, vocabulary index) followed by the count.
print(corpus_transformed)

  (0, 2)	1
  (0, 9)	1
  (0, 11)	1
  (0, 22)	1
  (1, 1)	1
  (1, 3)	1
  (1, 4)	1
  (1, 7)	1
  (1, 8)	1
  (1, 10)	1
  (1, 13)	1
  (1, 17)	1
  (1, 19)	1
  (1, 22)	2
  (2, 0)	1
  (2, 5)	1
  (2, 6)	1
  (2, 14)	1
  (2, 22)	1
  (3, 12)	1
  (3, 15)	1
  (3, 16)	1
  (3, 18)	1
  (3, 20)	1
  (4, 21)	1


In [8]:
# Mapping from each learned token to its column index in the count matrix.
print(ctv.vocabulary_)

{'hello': 9, 'how': 11, 'are': 2, 'you': 22, 'im': 13, 'getting': 8, 'bored': 4, 'at': 3, 'home': 10, 'and': 1, 'what': 19, 'do': 7, 'think': 17, 'did': 6, 'know': 14, 'about': 0, 'counts': 5, 'let': 15, 'see': 16, 'if': 12, 'this': 18, 'works': 20, 'yes': 21}


In [9]:
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.tokenize import word_tokenize 

# Build a small corpus of sentences
corpus = [ "hello, how are you?", 
          "im getting bored at home. And you? What do you think?", 
          "did you know about counts", "let's see if this works!", 
          "YES!!!!" ] 

# Initialize CountVectorizer with NLTK's word_tokenize as the tokenizer.
# NOTE(review): token_pattern=None presumably silences scikit-learn's warning
# about token_pattern being unused when a custom tokenizer is given — confirm.
ctv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None) 

# Fit the vectorizer on the corpus
ctv.fit(corpus) 

corpus_transformed = ctv.transform(corpus) 

# With word_tokenize the vocabulary also contains punctuation tokens such as ',' and '?'
print(ctv.vocabulary_) 

{'hello': 14, ',': 2, 'how': 16, 'are': 7, 'you': 27, '?': 4, 'im': 18, 'getting': 13, 'bored': 9, 'at': 8, 'home': 15, '.': 3, 'and': 6, 'what': 24, 'do': 12, 'think': 22, 'did': 11, 'know': 19, 'about': 5, 'counts': 10, 'let': 20, "'s": 1, 'see': 21, 'if': 17, 'this': 23, 'works': 25, '!': 0, 'yes': 26}


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk.tokenize import word_tokenize 

# Build a small corpus of sentences
corpus = ["hello, how are you?", 
          "im getting bored at home. And you? What do you think?",
          "did you know about counts", 
          "let's see if this works!", 
          "YES!!!!" ] 

# Initialize TfidfVectorizer with NLTK's word_tokenize as the tokenizer
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None) 
# Fit on the corpus
tfv.fit(corpus) 
# Transform the sentences; the values are floating-point weights instead of integer counts
corpus_transformed = tfv.transform(corpus) 
# Each printed line is (sentence index, vocabulary index) followed by the weight
print(corpus_transformed)

  (0, 27)	0.2965698850220162
  (0, 16)	0.4428321995085722
  (0, 14)	0.4428321995085722
  (0, 7)	0.4428321995085722
  (0, 4)	0.35727423026525224
  (0, 2)	0.4428321995085722
  (1, 27)	0.35299699146792735
  (1, 24)	0.2635440111190765
  (1, 22)	0.2635440111190765
  (1, 18)	0.2635440111190765
  (1, 15)	0.2635440111190765
  (1, 13)	0.2635440111190765
  (1, 12)	0.2635440111190765
  (1, 9)	0.2635440111190765
  (1, 8)	0.2635440111190765
  (1, 6)	0.2635440111190765
  (1, 4)	0.42525129752567803
  (1, 3)	0.2635440111190765
  (2, 27)	0.31752680284846835
  (2, 19)	0.4741246485558491
  (2, 11)	0.4741246485558491
  (2, 10)	0.4741246485558491
  (2, 5)	0.4741246485558491
  (3, 25)	0.38775666010579296
  (3, 23)	0.38775666010579296
  (3, 21)	0.38775666010579296
  (3, 20)	0.38775666010579296
  (3, 17)	0.38775666010579296
  (3, 1)	0.38775666010579296
  (3, 0)	0.3128396318588854
  (4, 26)	0.2959842226518677
  (4, 0)	0.9551928286692534


In [12]:
from nltk import ngrams 
from nltk.tokenize import word_tokenize 
# Size of each n-gram: 3 consecutive tokens
N = 3 

# Input sentence
sentence = "hi, how are you?" 

# Tokenize the sentence (punctuation becomes separate tokens)
tokenized_sentence = word_tokenize(sentence) 

# Generate every run of N consecutive tokens as a tuple
n_grams = list(ngrams(tokenized_sentence, N)) 
print(n_grams)

[('hi', ',', 'how'), (',', 'how', 'are'), ('how', 'are', 'you'), ('are', 'you', '?')]


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer over word_tokenize tokens with ngram_range=(1, 3),
# i.e. n-grams of length 1 through 3.
# Relies on word_tokenize having been imported in an earlier cell.
# NOTE(review): "tdidf_vsc" looks like a typo for "tfidf_vec"; left unchanged
# because other cells may reference this name — confirm before renaming.
tdidf_vsc = TfidfVectorizer(
                            tokenizer = word_tokenize,
                            token_pattern = None,
                            ngram_range= (1,3)
)

In [16]:
# Install the WordNet data
# (this step is not in the book)
# Works around a LookupError; relies on nltk having been imported in an earlier cell
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rebor\AppData\Roaming\nltk_data...


True

In [17]:
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer 

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Initialize the stemmer for English
stemmer = SnowballStemmer("english") 
words = ["fishing", "fishes", "fished"] 
# Print the stem and the lemma of each word side by side for comparison.
# lemmatize() is called without a part-of-speech argument here; in the output
# below "fishing" and "fished" come back unchanged while the stemmer maps
# all three words to "fish".
for word in words:
    print(f"word={word}") 
    print(f"stemmed_word={stemmer.stem(word)}") 
    print(f"lemma={lemmatizer.lemmatize(word)}") 
    print("")

word=fishing
stemmed_word=fish
lemma=fishing

word=fishes
stemmed_word=fish
lemma=fish

word=fished
stemmed_word=fish
lemma=fished

